From 14bb08c1df8db9ec6c8a05520d4eee67971235d9 Mon Sep 17 00:00:00 2001 From: Dimitri Sokolyuk Date: Thu, 27 Sep 2018 20:03:23 +0200 Subject: mod tidy --- vendor/golang.org/x/image/vector/acc_amd64.go | 34 - vendor/golang.org/x/image/vector/acc_amd64.s | 1083 -------------------- vendor/golang.org/x/image/vector/acc_other.go | 17 - vendor/golang.org/x/image/vector/gen.go | 447 -------- .../golang.org/x/image/vector/gen_acc_amd64.s.tmpl | 171 ---- vendor/golang.org/x/image/vector/raster_fixed.go | 327 ------ .../golang.org/x/image/vector/raster_floating.go | 220 ---- vendor/golang.org/x/image/vector/vector.go | 472 --------- 8 files changed, 2771 deletions(-) delete mode 100644 vendor/golang.org/x/image/vector/acc_amd64.go delete mode 100644 vendor/golang.org/x/image/vector/acc_amd64.s delete mode 100644 vendor/golang.org/x/image/vector/acc_other.go delete mode 100644 vendor/golang.org/x/image/vector/gen.go delete mode 100644 vendor/golang.org/x/image/vector/gen_acc_amd64.s.tmpl delete mode 100644 vendor/golang.org/x/image/vector/raster_fixed.go delete mode 100644 vendor/golang.org/x/image/vector/raster_floating.go delete mode 100644 vendor/golang.org/x/image/vector/vector.go (limited to 'vendor/golang.org/x/image/vector') diff --git a/vendor/golang.org/x/image/vector/acc_amd64.go b/vendor/golang.org/x/image/vector/acc_amd64.go deleted file mode 100644 index 68f6e03..0000000 --- a/vendor/golang.org/x/image/vector/acc_amd64.go +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build go1.6 -// +build !noasm - -package vector - -func haveSSE4_1() bool - -var haveFixedAccumulateSIMD = haveSSE4_1() - -const haveFloatingAccumulateSIMD = true - -//go:noescape -func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) - -//go:noescape -func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) - -//go:noescape -func fixedAccumulateMaskSIMD(buf []uint32) - -//go:noescape -func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) - -//go:noescape -func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) - -//go:noescape -func floatingAccumulateMaskSIMD(dst []uint32, src []float32) diff --git a/vendor/golang.org/x/image/vector/acc_amd64.s b/vendor/golang.org/x/image/vector/acc_amd64.s deleted file mode 100644 index 6a424bc..0000000 --- a/vendor/golang.org/x/image/vector/acc_amd64.s +++ /dev/null @@ -1,1083 +0,0 @@ -// generated by go run gen.go; DO NOT EDIT - -// +build !appengine -// +build gc -// +build go1.6 -// +build !noasm - -#include "textflag.h" - -// fl is short for floating point math. fx is short for fixed point math. - -DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff -DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff -DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000 -DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000 -DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff -DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff - -// scatterAndMulBy0x101 is a PSHUFB mask that brings the low four bytes of an -// XMM register to the low byte of that register's four uint32 values. It -// duplicates those bytes, effectively multiplying each uint32 by 0x101. -// -// It transforms a little-endian 16-byte XMM value from -// ijkl???????????? 
-// to -// ii00jj00kk00ll00 -DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000 -DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202 - -// gather is a PSHUFB mask that brings the second-lowest byte of the XMM -// register's four uint32 values to the low four bytes of that register. -// -// It transforms a little-endian 16-byte XMM value from -// ?i???j???k???l?? -// to -// ijkl000000000000 -DATA gather<>+0x00(SB)/8, $0x808080800d090501 -DATA gather<>+0x08(SB)/8, $0x8080808080808080 - -DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff -DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff -DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001 -DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001 - -GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16 -GLOBL flOne<>(SB), (NOPTR+RODATA), $16 -GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16 -GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16 -GLOBL gather<>(SB), (NOPTR+RODATA), $16 -GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16 -GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16 - -// func haveSSE4_1() bool -TEXT ·haveSSE4_1(SB), NOSPLIT, $0 - MOVQ $1, AX - CPUID - SHRQ $19, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - -// ---------------------------------------------------------------------------- - -// func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) -// -// XMM registers. Variable names are per -// https://github.com/google/font-rs/blob/master/src/accumulate.c -// -// xmm0 scratch -// xmm1 x -// xmm2 y, z -// xmm3 - -// xmm4 - -// xmm5 fxAlmost65536 -// xmm6 gather -// xmm7 offset -// xmm8 scatterAndMulBy0x101 -// xmm9 fxAlmost65536 -// xmm10 inverseFFFF -TEXT ·fixedAccumulateOpOverSIMD(SB), NOSPLIT, $0-48 - - MOVQ dst_base+0(FP), DI - MOVQ dst_len+8(FP), BX - MOVQ src_base+24(FP), SI - MOVQ src_len+32(FP), R10 - - // Sanity check that len(dst) >= len(src). - CMPQ BX, R10 - JLT fxAccOpOverEnd - - // R10 = len(src) &^ 3 - // R11 = len(src) - MOVQ R10, R11 - ANDQ $-4, R10 - - // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16. - MOVOU fxAlmost65536<>(SB), X5 - - // gather := XMM(see above) // PSHUFB shuffle mask. - // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. - // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff. - // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff. - MOVOU gather<>(SB), X6 - MOVOU scatterAndMulBy0x101<>(SB), X8 - MOVOU fxAlmost65536<>(SB), X9 - MOVOU inverseFFFF<>(SB), X10 - - // offset := XMM(0x00000000 repeated four times) // Cumulative sum. - XORPS X7, X7 - - // i := 0 - MOVQ $0, R9 - -fxAccOpOverLoop4: - // for i < (len(src) &^ 3) - CMPQ R9, R10 - JAE fxAccOpOverLoop1 - - // x = XMM(s0, s1, s2, s3) - // - // Where s0 is src[i+0], s1 is src[i+1], etc. - MOVOU (SI), X1 - - // scratch = XMM(0, s0, s1, s2) - // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) - MOVOU X1, X0 - PSLLO $4, X0 - PADDD X0, X1 - - // scratch = XMM(0, 0, 0, 0) - // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) - // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) - XORPS X0, X0 - SHUFPS $0x40, X1, X0 - PADDD X0, X1 - - // x += offset - PADDD X7, X1 - - // y = abs(x) - // y >>= 2 // Shift by 2*ϕ - 16. - // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. 
https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 - - // z = convertToInt32(y) - // No-op. - - // Blend over the dst's prior value. SIMD for i in 0..3: - // - // dstA := uint32(dst[i]) * 0x101 - // maskA := z@i - // outA := dstA*(0xffff-maskA)/0xffff + maskA - // dst[i] = uint8(outA >> 8) - // - // First, set X0 to dstA*(0xfff-maskA). - MOVL (DI), X0 - PSHUFB X8, X0 - MOVOU X9, X11 - PSUBL X2, X11 - PMULLD X11, X0 - - // We implement uint32 division by 0xffff as multiplication by a magic - // constant (0x800080001) and then a shift by a magic constant (47). - // See TestDivideByFFFF for a justification. - // - // That multiplication widens from uint32 to uint64, so we have to - // duplicate and shift our four uint32s from one XMM register (X0) to - // two XMM registers (X0 and X11). - // - // Move the second and fourth uint32s in X0 to be the first and third - // uint32s in X11. - MOVOU X0, X11 - PSRLQ $32, X11 - - // Multiply by magic, shift by magic. - // - // pmuludq %xmm10,%xmm0 - // pmuludq %xmm10,%xmm11 - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda - PSRLQ $47, X0 - PSRLQ $47, X11 - - // Merge the two registers back to one, X11, and add maskA. - PSLLQ $32, X11 - XORPS X0, X11 - PADDD X11, X2 - - // As per opSrcStore4, shuffle and copy the 4 second-lowest bytes. - PSHUFB X6, X2 - MOVL X2, (DI) - - // offset = XMM(x@3, x@3, x@3, x@3) - MOVOU X1, X7 - SHUFPS $0xff, X1, X7 - - // i += 4 - // dst = dst[4:] - // src = src[4:] - ADDQ $4, R9 - ADDQ $4, DI - ADDQ $16, SI - JMP fxAccOpOverLoop4 - -fxAccOpOverLoop1: - // for i < len(src) - CMPQ R9, R11 - JAE fxAccOpOverEnd - - // x = src[i] + offset - MOVL (SI), X1 - PADDD X7, X1 - - // y = abs(x) - // y >>= 2 // Shift by 2*ϕ - 16. - // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 - - // z = convertToInt32(y) - // No-op. - - // Blend over the dst's prior value. - // - // dstA := uint32(dst[0]) * 0x101 - // maskA := z - // outA := dstA*(0xffff-maskA)/0xffff + maskA - // dst[0] = uint8(outA >> 8) - MOVBLZX (DI), R12 - IMULL $0x101, R12 - MOVL X2, R13 - MOVL $0xffff, AX - SUBL R13, AX - MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX. - MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant... - MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX. - SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15). - ADDL DX, R13 - SHRL $8, R13 - MOVB R13, (DI) - - // offset = x - MOVOU X1, X7 - - // i += 1 - // dst = dst[1:] - // src = src[1:] - ADDQ $1, R9 - ADDQ $1, DI - ADDQ $4, SI - JMP fxAccOpOverLoop1 - -fxAccOpOverEnd: - RET - -// ---------------------------------------------------------------------------- - -// func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) -// -// XMM registers. 
Variable names are per -// https://github.com/google/font-rs/blob/master/src/accumulate.c -// -// xmm0 scratch -// xmm1 x -// xmm2 y, z -// xmm3 - -// xmm4 - -// xmm5 fxAlmost65536 -// xmm6 gather -// xmm7 offset -// xmm8 - -// xmm9 - -// xmm10 - -TEXT ·fixedAccumulateOpSrcSIMD(SB), NOSPLIT, $0-48 - - MOVQ dst_base+0(FP), DI - MOVQ dst_len+8(FP), BX - MOVQ src_base+24(FP), SI - MOVQ src_len+32(FP), R10 - - // Sanity check that len(dst) >= len(src). - CMPQ BX, R10 - JLT fxAccOpSrcEnd - - // R10 = len(src) &^ 3 - // R11 = len(src) - MOVQ R10, R11 - ANDQ $-4, R10 - - // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16. - MOVOU fxAlmost65536<>(SB), X5 - - // gather := XMM(see above) // PSHUFB shuffle mask. - MOVOU gather<>(SB), X6 - - // offset := XMM(0x00000000 repeated four times) // Cumulative sum. - XORPS X7, X7 - - // i := 0 - MOVQ $0, R9 - -fxAccOpSrcLoop4: - // for i < (len(src) &^ 3) - CMPQ R9, R10 - JAE fxAccOpSrcLoop1 - - // x = XMM(s0, s1, s2, s3) - // - // Where s0 is src[i+0], s1 is src[i+1], etc. - MOVOU (SI), X1 - - // scratch = XMM(0, s0, s1, s2) - // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) - MOVOU X1, X0 - PSLLO $4, X0 - PADDD X0, X1 - - // scratch = XMM(0, 0, 0, 0) - // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) - // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) - XORPS X0, X0 - SHUFPS $0x40, X1, X0 - PADDD X0, X1 - - // x += offset - PADDD X7, X1 - - // y = abs(x) - // y >>= 2 // Shift by 2*ϕ - 16. - // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 - - // z = convertToInt32(y) - // No-op. - - // z = shuffleTheSecondLowestBytesOfEach4ByteElement(z) - // copy(dst[:4], low4BytesOf(z)) - PSHUFB X6, X2 - MOVL X2, (DI) - - // offset = XMM(x@3, x@3, x@3, x@3) - MOVOU X1, X7 - SHUFPS $0xff, X1, X7 - - // i += 4 - // dst = dst[4:] - // src = src[4:] - ADDQ $4, R9 - ADDQ $4, DI - ADDQ $16, SI - JMP fxAccOpSrcLoop4 - -fxAccOpSrcLoop1: - // for i < len(src) - CMPQ R9, R11 - JAE fxAccOpSrcEnd - - // x = src[i] + offset - MOVL (SI), X1 - PADDD X7, X1 - - // y = abs(x) - // y >>= 2 // Shift by 2*ϕ - 16. - // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 - - // z = convertToInt32(y) - // No-op. - - // dst[0] = uint8(z>>8) - MOVL X2, BX - SHRL $8, BX - MOVB BX, (DI) - - // offset = x - MOVOU X1, X7 - - // i += 1 - // dst = dst[1:] - // src = src[1:] - ADDQ $1, R9 - ADDQ $1, DI - ADDQ $4, SI - JMP fxAccOpSrcLoop1 - -fxAccOpSrcEnd: - RET - -// ---------------------------------------------------------------------------- - -// func fixedAccumulateMaskSIMD(buf []uint32) -// -// XMM registers. 
Variable names are per -// https://github.com/google/font-rs/blob/master/src/accumulate.c -// -// xmm0 scratch -// xmm1 x -// xmm2 y, z -// xmm3 - -// xmm4 - -// xmm5 fxAlmost65536 -// xmm6 - -// xmm7 offset -// xmm8 - -// xmm9 - -// xmm10 - -TEXT ·fixedAccumulateMaskSIMD(SB), NOSPLIT, $0-24 - - MOVQ buf_base+0(FP), DI - MOVQ buf_len+8(FP), BX - MOVQ buf_base+0(FP), SI - MOVQ buf_len+8(FP), R10 - - // R10 = len(src) &^ 3 - // R11 = len(src) - MOVQ R10, R11 - ANDQ $-4, R10 - - // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16. - MOVOU fxAlmost65536<>(SB), X5 - - // offset := XMM(0x00000000 repeated four times) // Cumulative sum. - XORPS X7, X7 - - // i := 0 - MOVQ $0, R9 - -fxAccMaskLoop4: - // for i < (len(src) &^ 3) - CMPQ R9, R10 - JAE fxAccMaskLoop1 - - // x = XMM(s0, s1, s2, s3) - // - // Where s0 is src[i+0], s1 is src[i+1], etc. - MOVOU (SI), X1 - - // scratch = XMM(0, s0, s1, s2) - // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) - MOVOU X1, X0 - PSLLO $4, X0 - PADDD X0, X1 - - // scratch = XMM(0, 0, 0, 0) - // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) - // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) - XORPS X0, X0 - SHUFPS $0x40, X1, X0 - PADDD X0, X1 - - // x += offset - PADDD X7, X1 - - // y = abs(x) - // y >>= 2 // Shift by 2*ϕ - 16. - // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 - - // z = convertToInt32(y) - // No-op. - - // copy(dst[:4], z) - MOVOU X2, (DI) - - // offset = XMM(x@3, x@3, x@3, x@3) - MOVOU X1, X7 - SHUFPS $0xff, X1, X7 - - // i += 4 - // dst = dst[4:] - // src = src[4:] - ADDQ $4, R9 - ADDQ $16, DI - ADDQ $16, SI - JMP fxAccMaskLoop4 - -fxAccMaskLoop1: - // for i < len(src) - CMPQ R9, R11 - JAE fxAccMaskEnd - - // x = src[i] + offset - MOVL (SI), X1 - PADDD X7, X1 - - // y = abs(x) - // y >>= 2 // Shift by 2*ϕ - 16. - // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 - - // z = convertToInt32(y) - // No-op. - - // dst[0] = uint32(z) - MOVL X2, (DI) - - // offset = x - MOVOU X1, X7 - - // i += 1 - // dst = dst[1:] - // src = src[1:] - ADDQ $1, R9 - ADDQ $4, DI - ADDQ $4, SI - JMP fxAccMaskLoop1 - -fxAccMaskEnd: - RET - -// ---------------------------------------------------------------------------- - -// func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) -// -// XMM registers. 
Variable names are per -// https://github.com/google/font-rs/blob/master/src/accumulate.c -// -// xmm0 scratch -// xmm1 x -// xmm2 y, z -// xmm3 flSignMask -// xmm4 flOne -// xmm5 flAlmost65536 -// xmm6 gather -// xmm7 offset -// xmm8 scatterAndMulBy0x101 -// xmm9 fxAlmost65536 -// xmm10 inverseFFFF -TEXT ·floatingAccumulateOpOverSIMD(SB), NOSPLIT, $8-48 - - MOVQ dst_base+0(FP), DI - MOVQ dst_len+8(FP), BX - MOVQ src_base+24(FP), SI - MOVQ src_len+32(FP), R10 - - // Sanity check that len(dst) >= len(src). - CMPQ BX, R10 - JLT flAccOpOverEnd - - // R10 = len(src) &^ 3 - // R11 = len(src) - MOVQ R10, R11 - ANDQ $-4, R10 - - // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is - // "Round To Zero". - STMXCSR mxcsrOrig-8(SP) - MOVL mxcsrOrig-8(SP), AX - ORL $0x6000, AX - MOVL AX, mxcsrNew-4(SP) - - // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. - // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. - // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. - MOVOU flSignMask<>(SB), X3 - MOVOU flOne<>(SB), X4 - MOVOU flAlmost65536<>(SB), X5 - - // gather := XMM(see above) // PSHUFB shuffle mask. - // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. - // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff. - // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff. - MOVOU gather<>(SB), X6 - MOVOU scatterAndMulBy0x101<>(SB), X8 - MOVOU fxAlmost65536<>(SB), X9 - MOVOU inverseFFFF<>(SB), X10 - - // offset := XMM(0x00000000 repeated four times) // Cumulative sum. - XORPS X7, X7 - - // i := 0 - MOVQ $0, R9 - -flAccOpOverLoop4: - // for i < (len(src) &^ 3) - CMPQ R9, R10 - JAE flAccOpOverLoop1 - - // x = XMM(s0, s1, s2, s3) - // - // Where s0 is src[i+0], s1 is src[i+1], etc. - MOVOU (SI), X1 - - // scratch = XMM(0, s0, s1, s2) - // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) - MOVOU X1, X0 - PSLLO $4, X0 - ADDPS X0, X1 - - // scratch = XMM(0, 0, 0, 0) - // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) - // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) - XORPS X0, X0 - SHUFPS $0x40, X1, X0 - ADDPS X0, X1 - - // x += offset - ADDPS X7, X1 - - // y = x & flSignMask - // y = min(y, flOne) - // y = mul(y, flAlmost65536) - MOVOU X3, X2 - ANDPS X1, X2 - MINPS X4, X2 - MULPS X5, X2 - - // z = convertToInt32(y) - LDMXCSR mxcsrNew-4(SP) - CVTPS2PL X2, X2 - LDMXCSR mxcsrOrig-8(SP) - - // Blend over the dst's prior value. SIMD for i in 0..3: - // - // dstA := uint32(dst[i]) * 0x101 - // maskA := z@i - // outA := dstA*(0xffff-maskA)/0xffff + maskA - // dst[i] = uint8(outA >> 8) - // - // First, set X0 to dstA*(0xfff-maskA). - MOVL (DI), X0 - PSHUFB X8, X0 - MOVOU X9, X11 - PSUBL X2, X11 - PMULLD X11, X0 - - // We implement uint32 division by 0xffff as multiplication by a magic - // constant (0x800080001) and then a shift by a magic constant (47). - // See TestDivideByFFFF for a justification. - // - // That multiplication widens from uint32 to uint64, so we have to - // duplicate and shift our four uint32s from one XMM register (X0) to - // two XMM registers (X0 and X11). - // - // Move the second and fourth uint32s in X0 to be the first and third - // uint32s in X11. - MOVOU X0, X11 - PSRLQ $32, X11 - - // Multiply by magic, shift by magic. 
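
The multiply-and-shift above can be sanity-checked in plain Go. The sketch below is an illustrative aside rather than part of the vendored file: it shows the scalar identity the SIMD code relies on, using 0x80008001 (the value loaded into inverseFFFF) as the magic constant and 47 as the shift.

package main

import "fmt"

// divBy0xFFFF is the scalar form of the trick used above: dividing a uint32
// by 0xffff becomes a widening multiply by 0x80008001 followed by a right
// shift of 47 bits.
func divBy0xFFFF(x uint32) uint32 {
	return uint32((uint64(x) * 0x80008001) >> 47)
}

func main() {
	// Spot-check a few values, including 0xffff*0xffff, the largest dividend
	// the blend step can produce; TestDivideByFFFF, referenced above,
	// justifies the constants in general.
	for _, x := range []uint32{0, 1, 0xfffe, 0xffff, 0x10000, 0xfffe0001} {
		fmt.Println(x, divBy0xFFFF(x) == x/0xffff)
	}
}
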
- // - // pmuludq %xmm10,%xmm0 - // pmuludq %xmm10,%xmm11 - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda - PSRLQ $47, X0 - PSRLQ $47, X11 - - // Merge the two registers back to one, X11, and add maskA. - PSLLQ $32, X11 - XORPS X0, X11 - PADDD X11, X2 - - // As per opSrcStore4, shuffle and copy the 4 second-lowest bytes. - PSHUFB X6, X2 - MOVL X2, (DI) - - // offset = XMM(x@3, x@3, x@3, x@3) - MOVOU X1, X7 - SHUFPS $0xff, X1, X7 - - // i += 4 - // dst = dst[4:] - // src = src[4:] - ADDQ $4, R9 - ADDQ $4, DI - ADDQ $16, SI - JMP flAccOpOverLoop4 - -flAccOpOverLoop1: - // for i < len(src) - CMPQ R9, R11 - JAE flAccOpOverEnd - - // x = src[i] + offset - MOVL (SI), X1 - ADDPS X7, X1 - - // y = x & flSignMask - // y = min(y, flOne) - // y = mul(y, flAlmost65536) - MOVOU X3, X2 - ANDPS X1, X2 - MINPS X4, X2 - MULPS X5, X2 - - // z = convertToInt32(y) - LDMXCSR mxcsrNew-4(SP) - CVTPS2PL X2, X2 - LDMXCSR mxcsrOrig-8(SP) - - // Blend over the dst's prior value. - // - // dstA := uint32(dst[0]) * 0x101 - // maskA := z - // outA := dstA*(0xffff-maskA)/0xffff + maskA - // dst[0] = uint8(outA >> 8) - MOVBLZX (DI), R12 - IMULL $0x101, R12 - MOVL X2, R13 - MOVL $0xffff, AX - SUBL R13, AX - MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX. - MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant... - MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX. - SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15). - ADDL DX, R13 - SHRL $8, R13 - MOVB R13, (DI) - - // offset = x - MOVOU X1, X7 - - // i += 1 - // dst = dst[1:] - // src = src[1:] - ADDQ $1, R9 - ADDQ $1, DI - ADDQ $4, SI - JMP flAccOpOverLoop1 - -flAccOpOverEnd: - RET - -// ---------------------------------------------------------------------------- - -// func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) -// -// XMM registers. Variable names are per -// https://github.com/google/font-rs/blob/master/src/accumulate.c -// -// xmm0 scratch -// xmm1 x -// xmm2 y, z -// xmm3 flSignMask -// xmm4 flOne -// xmm5 flAlmost65536 -// xmm6 gather -// xmm7 offset -// xmm8 - -// xmm9 - -// xmm10 - -TEXT ·floatingAccumulateOpSrcSIMD(SB), NOSPLIT, $8-48 - - MOVQ dst_base+0(FP), DI - MOVQ dst_len+8(FP), BX - MOVQ src_base+24(FP), SI - MOVQ src_len+32(FP), R10 - - // Sanity check that len(dst) >= len(src). - CMPQ BX, R10 - JLT flAccOpSrcEnd - - // R10 = len(src) &^ 3 - // R11 = len(src) - MOVQ R10, R11 - ANDQ $-4, R10 - - // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is - // "Round To Zero". - STMXCSR mxcsrOrig-8(SP) - MOVL mxcsrOrig-8(SP), AX - ORL $0x6000, AX - MOVL AX, mxcsrNew-4(SP) - - // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. - // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. - // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. - MOVOU flSignMask<>(SB), X3 - MOVOU flOne<>(SB), X4 - MOVOU flAlmost65536<>(SB), X5 - - // gather := XMM(see above) // PSHUFB shuffle mask. - MOVOU gather<>(SB), X6 - - // offset := XMM(0x00000000 repeated four times) // Cumulative sum. - XORPS X7, X7 - - // i := 0 - MOVQ $0, R9 - -flAccOpSrcLoop4: - // for i < (len(src) &^ 3) - CMPQ R9, R10 - JAE flAccOpSrcLoop1 - - // x = XMM(s0, s1, s2, s3) - // - // Where s0 is src[i+0], s1 is src[i+1], etc. 
- MOVOU (SI), X1 - - // scratch = XMM(0, s0, s1, s2) - // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) - MOVOU X1, X0 - PSLLO $4, X0 - ADDPS X0, X1 - - // scratch = XMM(0, 0, 0, 0) - // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) - // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) - XORPS X0, X0 - SHUFPS $0x40, X1, X0 - ADDPS X0, X1 - - // x += offset - ADDPS X7, X1 - - // y = x & flSignMask - // y = min(y, flOne) - // y = mul(y, flAlmost65536) - MOVOU X3, X2 - ANDPS X1, X2 - MINPS X4, X2 - MULPS X5, X2 - - // z = convertToInt32(y) - LDMXCSR mxcsrNew-4(SP) - CVTPS2PL X2, X2 - LDMXCSR mxcsrOrig-8(SP) - - // z = shuffleTheSecondLowestBytesOfEach4ByteElement(z) - // copy(dst[:4], low4BytesOf(z)) - PSHUFB X6, X2 - MOVL X2, (DI) - - // offset = XMM(x@3, x@3, x@3, x@3) - MOVOU X1, X7 - SHUFPS $0xff, X1, X7 - - // i += 4 - // dst = dst[4:] - // src = src[4:] - ADDQ $4, R9 - ADDQ $4, DI - ADDQ $16, SI - JMP flAccOpSrcLoop4 - -flAccOpSrcLoop1: - // for i < len(src) - CMPQ R9, R11 - JAE flAccOpSrcEnd - - // x = src[i] + offset - MOVL (SI), X1 - ADDPS X7, X1 - - // y = x & flSignMask - // y = min(y, flOne) - // y = mul(y, flAlmost65536) - MOVOU X3, X2 - ANDPS X1, X2 - MINPS X4, X2 - MULPS X5, X2 - - // z = convertToInt32(y) - LDMXCSR mxcsrNew-4(SP) - CVTPS2PL X2, X2 - LDMXCSR mxcsrOrig-8(SP) - - // dst[0] = uint8(z>>8) - MOVL X2, BX - SHRL $8, BX - MOVB BX, (DI) - - // offset = x - MOVOU X1, X7 - - // i += 1 - // dst = dst[1:] - // src = src[1:] - ADDQ $1, R9 - ADDQ $1, DI - ADDQ $4, SI - JMP flAccOpSrcLoop1 - -flAccOpSrcEnd: - RET - -// ---------------------------------------------------------------------------- - -// func floatingAccumulateMaskSIMD(dst []uint32, src []float32) -// -// XMM registers. Variable names are per -// https://github.com/google/font-rs/blob/master/src/accumulate.c -// -// xmm0 scratch -// xmm1 x -// xmm2 y, z -// xmm3 flSignMask -// xmm4 flOne -// xmm5 flAlmost65536 -// xmm6 - -// xmm7 offset -// xmm8 - -// xmm9 - -// xmm10 - -TEXT ·floatingAccumulateMaskSIMD(SB), NOSPLIT, $8-48 - - MOVQ dst_base+0(FP), DI - MOVQ dst_len+8(FP), BX - MOVQ src_base+24(FP), SI - MOVQ src_len+32(FP), R10 - - // Sanity check that len(dst) >= len(src). - CMPQ BX, R10 - JLT flAccMaskEnd - - // R10 = len(src) &^ 3 - // R11 = len(src) - MOVQ R10, R11 - ANDQ $-4, R10 - - // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is - // "Round To Zero". - STMXCSR mxcsrOrig-8(SP) - MOVL mxcsrOrig-8(SP), AX - ORL $0x6000, AX - MOVL AX, mxcsrNew-4(SP) - - // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. - // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. - // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. - MOVOU flSignMask<>(SB), X3 - MOVOU flOne<>(SB), X4 - MOVOU flAlmost65536<>(SB), X5 - - // offset := XMM(0x00000000 repeated four times) // Cumulative sum. - XORPS X7, X7 - - // i := 0 - MOVQ $0, R9 - -flAccMaskLoop4: - // for i < (len(src) &^ 3) - CMPQ R9, R10 - JAE flAccMaskLoop1 - - // x = XMM(s0, s1, s2, s3) - // - // Where s0 is src[i+0], s1 is src[i+1], etc. 
- MOVOU (SI), X1 - - // scratch = XMM(0, s0, s1, s2) - // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) - MOVOU X1, X0 - PSLLO $4, X0 - ADDPS X0, X1 - - // scratch = XMM(0, 0, 0, 0) - // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) - // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) - XORPS X0, X0 - SHUFPS $0x40, X1, X0 - ADDPS X0, X1 - - // x += offset - ADDPS X7, X1 - - // y = x & flSignMask - // y = min(y, flOne) - // y = mul(y, flAlmost65536) - MOVOU X3, X2 - ANDPS X1, X2 - MINPS X4, X2 - MULPS X5, X2 - - // z = convertToInt32(y) - LDMXCSR mxcsrNew-4(SP) - CVTPS2PL X2, X2 - LDMXCSR mxcsrOrig-8(SP) - - // copy(dst[:4], z) - MOVOU X2, (DI) - - // offset = XMM(x@3, x@3, x@3, x@3) - MOVOU X1, X7 - SHUFPS $0xff, X1, X7 - - // i += 4 - // dst = dst[4:] - // src = src[4:] - ADDQ $4, R9 - ADDQ $16, DI - ADDQ $16, SI - JMP flAccMaskLoop4 - -flAccMaskLoop1: - // for i < len(src) - CMPQ R9, R11 - JAE flAccMaskEnd - - // x = src[i] + offset - MOVL (SI), X1 - ADDPS X7, X1 - - // y = x & flSignMask - // y = min(y, flOne) - // y = mul(y, flAlmost65536) - MOVOU X3, X2 - ANDPS X1, X2 - MINPS X4, X2 - MULPS X5, X2 - - // z = convertToInt32(y) - LDMXCSR mxcsrNew-4(SP) - CVTPS2PL X2, X2 - LDMXCSR mxcsrOrig-8(SP) - - // dst[0] = uint32(z) - MOVL X2, (DI) - - // offset = x - MOVOU X1, X7 - - // i += 1 - // dst = dst[1:] - // src = src[1:] - ADDQ $1, R9 - ADDQ $4, DI - ADDQ $4, SI - JMP flAccMaskLoop1 - -flAccMaskEnd: - RET diff --git a/vendor/golang.org/x/image/vector/acc_other.go b/vendor/golang.org/x/image/vector/acc_other.go deleted file mode 100644 index 30425be..0000000 --- a/vendor/golang.org/x/image/vector/acc_other.go +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !amd64 appengine !gc !go1.6 noasm - -package vector - -const haveFixedAccumulateSIMD = false -const haveFloatingAccumulateSIMD = false - -func fixedAccumulateOpOverSIMD(dst []uint8, src []uint32) {} -func fixedAccumulateOpSrcSIMD(dst []uint8, src []uint32) {} -func fixedAccumulateMaskSIMD(buf []uint32) {} -func floatingAccumulateOpOverSIMD(dst []uint8, src []float32) {} -func floatingAccumulateOpSrcSIMD(dst []uint8, src []float32) {} -func floatingAccumulateMaskSIMD(dst []uint32, src []float32) {} diff --git a/vendor/golang.org/x/image/vector/gen.go b/vendor/golang.org/x/image/vector/gen.go deleted file mode 100644 index 28b298b..0000000 --- a/vendor/golang.org/x/image/vector/gen.go +++ /dev/null @@ -1,447 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build ignore - -package main - -import ( - "bytes" - "io/ioutil" - "log" - "strings" - "text/template" -) - -const ( - copyright = "" + - "// Copyright 2016 The Go Authors. 
All rights reserved.\n" + - "// Use of this source code is governed by a BSD-style\n" + - "// license that can be found in the LICENSE file.\n" - - doNotEdit = "// generated by go run gen.go; DO NOT EDIT\n" - - dashDashDash = "// --------" -) - -func main() { - tmpl, err := ioutil.ReadFile("gen_acc_amd64.s.tmpl") - if err != nil { - log.Fatalf("ReadFile: %v", err) - } - if !bytes.HasPrefix(tmpl, []byte(copyright)) { - log.Fatal("source template did not start with the copyright header") - } - tmpl = tmpl[len(copyright):] - - preamble := []byte(nil) - if i := bytes.Index(tmpl, []byte(dashDashDash)); i < 0 { - log.Fatalf("source template did not contain %q", dashDashDash) - } else { - preamble, tmpl = tmpl[:i], tmpl[i:] - } - - t, err := template.New("").Parse(string(tmpl)) - if err != nil { - log.Fatalf("Parse: %v", err) - } - - out := bytes.NewBuffer(nil) - out.WriteString(doNotEdit) - out.Write(preamble) - - for i, v := range instances { - if i != 0 { - out.WriteString("\n") - } - if strings.Contains(v.LoadArgs, "{{.ShortName}}") { - v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1) - } - if err := t.Execute(out, v); err != nil { - log.Fatalf("Execute(%q): %v", v.ShortName, err) - } - } - - if err := ioutil.WriteFile("acc_amd64.s", out.Bytes(), 0666); err != nil { - log.Fatalf("WriteFile: %v", err) - } -} - -var instances = []struct { - LongName string - ShortName string - FrameSize string - ArgsSize string - Args string - DstElemSize1 int - DstElemSize4 int - XMM3 string - XMM4 string - XMM5 string - XMM6 string - XMM8 string - XMM9 string - XMM10 string - LoadArgs string - Setup string - LoadXMMRegs string - Add string - ClampAndScale string - ConvertToInt32 string - Store4 string - Store1 string -}{{ - LongName: "fixedAccumulateOpOver", - ShortName: "fxAccOpOver", - FrameSize: fxFrameSize, - ArgsSize: twoArgArgsSize, - Args: "dst []uint8, src []uint32", - DstElemSize1: 1 * sizeOfUint8, - DstElemSize4: 4 * sizeOfUint8, - XMM3: fxXMM3, - XMM4: fxXMM4, - XMM5: fxXMM5, - XMM6: opOverXMM6, - XMM8: opOverXMM8, - XMM9: opOverXMM9, - XMM10: opOverXMM10, - LoadArgs: twoArgLoadArgs, - Setup: fxSetup, - LoadXMMRegs: fxLoadXMMRegs + "\n" + opOverLoadXMMRegs, - Add: fxAdd, - ClampAndScale: fxClampAndScale, - ConvertToInt32: fxConvertToInt32, - Store4: opOverStore4, - Store1: opOverStore1, -}, { - LongName: "fixedAccumulateOpSrc", - ShortName: "fxAccOpSrc", - FrameSize: fxFrameSize, - ArgsSize: twoArgArgsSize, - Args: "dst []uint8, src []uint32", - DstElemSize1: 1 * sizeOfUint8, - DstElemSize4: 4 * sizeOfUint8, - XMM3: fxXMM3, - XMM4: fxXMM4, - XMM5: fxXMM5, - XMM6: opSrcXMM6, - XMM8: opSrcXMM8, - XMM9: opSrcXMM9, - XMM10: opSrcXMM10, - LoadArgs: twoArgLoadArgs, - Setup: fxSetup, - LoadXMMRegs: fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs, - Add: fxAdd, - ClampAndScale: fxClampAndScale, - ConvertToInt32: fxConvertToInt32, - Store4: opSrcStore4, - Store1: opSrcStore1, -}, { - LongName: "fixedAccumulateMask", - ShortName: "fxAccMask", - FrameSize: fxFrameSize, - ArgsSize: oneArgArgsSize, - Args: "buf []uint32", - DstElemSize1: 1 * sizeOfUint32, - DstElemSize4: 4 * sizeOfUint32, - XMM3: fxXMM3, - XMM4: fxXMM4, - XMM5: fxXMM5, - XMM6: maskXMM6, - XMM8: maskXMM8, - XMM9: maskXMM9, - XMM10: maskXMM10, - LoadArgs: oneArgLoadArgs, - Setup: fxSetup, - LoadXMMRegs: fxLoadXMMRegs + "\n" + maskLoadXMMRegs, - Add: fxAdd, - ClampAndScale: fxClampAndScale, - ConvertToInt32: fxConvertToInt32, - Store4: maskStore4, - Store1: maskStore1, -}, { - LongName: "floatingAccumulateOpOver", - ShortName: 
"flAccOpOver", - FrameSize: flFrameSize, - ArgsSize: twoArgArgsSize, - Args: "dst []uint8, src []float32", - DstElemSize1: 1 * sizeOfUint8, - DstElemSize4: 4 * sizeOfUint8, - XMM3: flXMM3, - XMM4: flXMM4, - XMM5: flXMM5, - XMM6: opOverXMM6, - XMM8: opOverXMM8, - XMM9: opOverXMM9, - XMM10: opOverXMM10, - LoadArgs: twoArgLoadArgs, - Setup: flSetup, - LoadXMMRegs: flLoadXMMRegs + "\n" + opOverLoadXMMRegs, - Add: flAdd, - ClampAndScale: flClampAndScale, - ConvertToInt32: flConvertToInt32, - Store4: opOverStore4, - Store1: opOverStore1, -}, { - LongName: "floatingAccumulateOpSrc", - ShortName: "flAccOpSrc", - FrameSize: flFrameSize, - ArgsSize: twoArgArgsSize, - Args: "dst []uint8, src []float32", - DstElemSize1: 1 * sizeOfUint8, - DstElemSize4: 4 * sizeOfUint8, - XMM3: flXMM3, - XMM4: flXMM4, - XMM5: flXMM5, - XMM6: opSrcXMM6, - XMM8: opSrcXMM8, - XMM9: opSrcXMM9, - XMM10: opSrcXMM10, - LoadArgs: twoArgLoadArgs, - Setup: flSetup, - LoadXMMRegs: flLoadXMMRegs + "\n" + opSrcLoadXMMRegs, - Add: flAdd, - ClampAndScale: flClampAndScale, - ConvertToInt32: flConvertToInt32, - Store4: opSrcStore4, - Store1: opSrcStore1, -}, { - LongName: "floatingAccumulateMask", - ShortName: "flAccMask", - FrameSize: flFrameSize, - ArgsSize: twoArgArgsSize, - Args: "dst []uint32, src []float32", - DstElemSize1: 1 * sizeOfUint32, - DstElemSize4: 4 * sizeOfUint32, - XMM3: flXMM3, - XMM4: flXMM4, - XMM5: flXMM5, - XMM6: maskXMM6, - XMM8: maskXMM8, - XMM9: maskXMM9, - XMM10: maskXMM10, - LoadArgs: twoArgLoadArgs, - Setup: flSetup, - LoadXMMRegs: flLoadXMMRegs + "\n" + maskLoadXMMRegs, - Add: flAdd, - ClampAndScale: flClampAndScale, - ConvertToInt32: flConvertToInt32, - Store4: maskStore4, - Store1: maskStore1, -}} - -const ( - fxFrameSize = `0` - flFrameSize = `8` - - oneArgArgsSize = `24` - twoArgArgsSize = `48` - - sizeOfUint8 = 1 - sizeOfUint32 = 4 - - fxXMM3 = `-` - flXMM3 = `flSignMask` - - fxXMM4 = `-` - flXMM4 = `flOne` - - fxXMM5 = `fxAlmost65536` - flXMM5 = `flAlmost65536` - - oneArgLoadArgs = ` - MOVQ buf_base+0(FP), DI - MOVQ buf_len+8(FP), BX - MOVQ buf_base+0(FP), SI - MOVQ buf_len+8(FP), R10 - ` - twoArgLoadArgs = ` - MOVQ dst_base+0(FP), DI - MOVQ dst_len+8(FP), BX - MOVQ src_base+24(FP), SI - MOVQ src_len+32(FP), R10 - // Sanity check that len(dst) >= len(src). - CMPQ BX, R10 - JLT {{.ShortName}}End - ` - - fxSetup = `` - flSetup = ` - // Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is - // "Round To Zero". - STMXCSR mxcsrOrig-8(SP) - MOVL mxcsrOrig-8(SP), AX - ORL $0x6000, AX - MOVL AX, mxcsrNew-4(SP) - ` - - fxLoadXMMRegs = ` - // fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of an uint16. - MOVOU fxAlmost65536<>(SB), X5 - ` - flLoadXMMRegs = ` - // flSignMask := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32. - // flOne := XMM(0x3f800000 repeated four times) // 1 as a float32. - // flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32. - MOVOU flSignMask<>(SB), X3 - MOVOU flOne<>(SB), X4 - MOVOU flAlmost65536<>(SB), X5 - ` - - fxAdd = `PADDD` - flAdd = `ADDPS` - - fxClampAndScale = ` - // y = abs(x) - // y >>= 2 // Shift by 2*ϕ - 16. - // y = min(y, fxAlmost65536) - // - // pabsd %xmm1,%xmm2 - // psrld $0x2,%xmm2 - // pminud %xmm5,%xmm2 - // - // Hopefully we'll get these opcode mnemonics into the assembler for Go - // 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but - // it's similar. 
- BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1 - BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02 - BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5 - ` - flClampAndScale = ` - // y = x & flSignMask - // y = min(y, flOne) - // y = mul(y, flAlmost65536) - MOVOU X3, X2 - ANDPS X1, X2 - MINPS X4, X2 - MULPS X5, X2 - ` - - fxConvertToInt32 = ` - // z = convertToInt32(y) - // No-op. - ` - flConvertToInt32 = ` - // z = convertToInt32(y) - LDMXCSR mxcsrNew-4(SP) - CVTPS2PL X2, X2 - LDMXCSR mxcsrOrig-8(SP) - ` - - opOverStore4 = ` - // Blend over the dst's prior value. SIMD for i in 0..3: - // - // dstA := uint32(dst[i]) * 0x101 - // maskA := z@i - // outA := dstA*(0xffff-maskA)/0xffff + maskA - // dst[i] = uint8(outA >> 8) - // - // First, set X0 to dstA*(0xfff-maskA). - MOVL (DI), X0 - PSHUFB X8, X0 - MOVOU X9, X11 - PSUBL X2, X11 - PMULLD X11, X0 - // We implement uint32 division by 0xffff as multiplication by a magic - // constant (0x800080001) and then a shift by a magic constant (47). - // See TestDivideByFFFF for a justification. - // - // That multiplication widens from uint32 to uint64, so we have to - // duplicate and shift our four uint32s from one XMM register (X0) to - // two XMM registers (X0 and X11). - // - // Move the second and fourth uint32s in X0 to be the first and third - // uint32s in X11. - MOVOU X0, X11 - PSRLQ $32, X11 - // Multiply by magic, shift by magic. - // - // pmuludq %xmm10,%xmm0 - // pmuludq %xmm10,%xmm11 - BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2 - BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda - PSRLQ $47, X0 - PSRLQ $47, X11 - // Merge the two registers back to one, X11, and add maskA. - PSLLQ $32, X11 - XORPS X0, X11 - PADDD X11, X2 - // As per opSrcStore4, shuffle and copy the 4 second-lowest bytes. - PSHUFB X6, X2 - MOVL X2, (DI) - ` - opSrcStore4 = ` - // z = shuffleTheSecondLowestBytesOfEach4ByteElement(z) - // copy(dst[:4], low4BytesOf(z)) - PSHUFB X6, X2 - MOVL X2, (DI) - ` - maskStore4 = ` - // copy(dst[:4], z) - MOVOU X2, (DI) - ` - - opOverStore1 = ` - // Blend over the dst's prior value. - // - // dstA := uint32(dst[0]) * 0x101 - // maskA := z - // outA := dstA*(0xffff-maskA)/0xffff + maskA - // dst[0] = uint8(outA >> 8) - MOVBLZX (DI), R12 - IMULL $0x101, R12 - MOVL X2, R13 - MOVL $0xffff, AX - SUBL R13, AX - MULL R12 // MULL's implicit arg is AX, and the result is stored in DX:AX. - MOVL $0x80008001, BX // Divide by 0xffff is to first multiply by a magic constant... - MULL BX // MULL's implicit arg is AX, and the result is stored in DX:AX. - SHRL $15, DX // ...and then shift by another magic constant (47 - 32 = 15). - ADDL DX, R13 - SHRL $8, R13 - MOVB R13, (DI) - ` - opSrcStore1 = ` - // dst[0] = uint8(z>>8) - MOVL X2, BX - SHRL $8, BX - MOVB BX, (DI) - ` - maskStore1 = ` - // dst[0] = uint32(z) - MOVL X2, (DI) - ` - - opOverXMM6 = `gather` - opSrcXMM6 = `gather` - maskXMM6 = `-` - - opOverXMM8 = `scatterAndMulBy0x101` - opSrcXMM8 = `-` - maskXMM8 = `-` - - opOverXMM9 = `fxAlmost65536` - opSrcXMM9 = `-` - maskXMM9 = `-` - - opOverXMM10 = `inverseFFFF` - opSrcXMM10 = `-` - maskXMM10 = `-` - - opOverLoadXMMRegs = ` - // gather := XMM(see above) // PSHUFB shuffle mask. - // scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask. - // fxAlmost65536 := XMM(0x0000ffff repeated four times) // 0xffff. - // inverseFFFF := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff. 
- MOVOU gather<>(SB), X6 - MOVOU scatterAndMulBy0x101<>(SB), X8 - MOVOU fxAlmost65536<>(SB), X9 - MOVOU inverseFFFF<>(SB), X10 - ` - opSrcLoadXMMRegs = ` - // gather := XMM(see above) // PSHUFB shuffle mask. - MOVOU gather<>(SB), X6 - ` - maskLoadXMMRegs = `` -) diff --git a/vendor/golang.org/x/image/vector/gen_acc_amd64.s.tmpl b/vendor/golang.org/x/image/vector/gen_acc_amd64.s.tmpl deleted file mode 100644 index 66b21a1..0000000 --- a/vendor/golang.org/x/image/vector/gen_acc_amd64.s.tmpl +++ /dev/null @@ -1,171 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !appengine -// +build gc -// +build go1.6 -// +build !noasm - -#include "textflag.h" - -// fl is short for floating point math. fx is short for fixed point math. - -DATA flAlmost65536<>+0x00(SB)/8, $0x477fffff477fffff -DATA flAlmost65536<>+0x08(SB)/8, $0x477fffff477fffff -DATA flOne<>+0x00(SB)/8, $0x3f8000003f800000 -DATA flOne<>+0x08(SB)/8, $0x3f8000003f800000 -DATA flSignMask<>+0x00(SB)/8, $0x7fffffff7fffffff -DATA flSignMask<>+0x08(SB)/8, $0x7fffffff7fffffff - -// scatterAndMulBy0x101 is a PSHUFB mask that brings the low four bytes of an -// XMM register to the low byte of that register's four uint32 values. It -// duplicates those bytes, effectively multiplying each uint32 by 0x101. -// -// It transforms a little-endian 16-byte XMM value from -// ijkl???????????? -// to -// ii00jj00kk00ll00 -DATA scatterAndMulBy0x101<>+0x00(SB)/8, $0x8080010180800000 -DATA scatterAndMulBy0x101<>+0x08(SB)/8, $0x8080030380800202 - -// gather is a PSHUFB mask that brings the second-lowest byte of the XMM -// register's four uint32 values to the low four bytes of that register. -// -// It transforms a little-endian 16-byte XMM value from -// ?i???j???k???l?? -// to -// ijkl000000000000 -DATA gather<>+0x00(SB)/8, $0x808080800d090501 -DATA gather<>+0x08(SB)/8, $0x8080808080808080 - -DATA fxAlmost65536<>+0x00(SB)/8, $0x0000ffff0000ffff -DATA fxAlmost65536<>+0x08(SB)/8, $0x0000ffff0000ffff -DATA inverseFFFF<>+0x00(SB)/8, $0x8000800180008001 -DATA inverseFFFF<>+0x08(SB)/8, $0x8000800180008001 - -GLOBL flAlmost65536<>(SB), (NOPTR+RODATA), $16 -GLOBL flOne<>(SB), (NOPTR+RODATA), $16 -GLOBL flSignMask<>(SB), (NOPTR+RODATA), $16 -GLOBL scatterAndMulBy0x101<>(SB), (NOPTR+RODATA), $16 -GLOBL gather<>(SB), (NOPTR+RODATA), $16 -GLOBL fxAlmost65536<>(SB), (NOPTR+RODATA), $16 -GLOBL inverseFFFF<>(SB), (NOPTR+RODATA), $16 - -// func haveSSE4_1() bool -TEXT ·haveSSE4_1(SB), NOSPLIT, $0 - MOVQ $1, AX - CPUID - SHRQ $19, CX - ANDQ $1, CX - MOVB CX, ret+0(FP) - RET - -// ---------------------------------------------------------------------------- - -// func {{.LongName}}SIMD({{.Args}}) -// -// XMM registers. Variable names are per -// https://github.com/google/font-rs/blob/master/src/accumulate.c -// -// xmm0 scratch -// xmm1 x -// xmm2 y, z -// xmm3 {{.XMM3}} -// xmm4 {{.XMM4}} -// xmm5 {{.XMM5}} -// xmm6 {{.XMM6}} -// xmm7 offset -// xmm8 {{.XMM8}} -// xmm9 {{.XMM9}} -// xmm10 {{.XMM10}} -TEXT ·{{.LongName}}SIMD(SB), NOSPLIT, ${{.FrameSize}}-{{.ArgsSize}} - {{.LoadArgs}} - - // R10 = len(src) &^ 3 - // R11 = len(src) - MOVQ R10, R11 - ANDQ $-4, R10 - - {{.Setup}} - - {{.LoadXMMRegs}} - - // offset := XMM(0x00000000 repeated four times) // Cumulative sum. 
- XORPS X7, X7 - - // i := 0 - MOVQ $0, R9 - -{{.ShortName}}Loop4: - // for i < (len(src) &^ 3) - CMPQ R9, R10 - JAE {{.ShortName}}Loop1 - - // x = XMM(s0, s1, s2, s3) - // - // Where s0 is src[i+0], s1 is src[i+1], etc. - MOVOU (SI), X1 - - // scratch = XMM(0, s0, s1, s2) - // x += scratch // yields x == XMM(s0, s0+s1, s1+s2, s2+s3) - MOVOU X1, X0 - PSLLO $4, X0 - {{.Add}} X0, X1 - - // scratch = XMM(0, 0, 0, 0) - // scratch = XMM(scratch@0, scratch@0, x@0, x@1) // yields scratch == XMM(0, 0, s0, s0+s1) - // x += scratch // yields x == XMM(s0, s0+s1, s0+s1+s2, s0+s1+s2+s3) - XORPS X0, X0 - SHUFPS $0x40, X1, X0 - {{.Add}} X0, X1 - - // x += offset - {{.Add}} X7, X1 - - {{.ClampAndScale}} - - {{.ConvertToInt32}} - - {{.Store4}} - - // offset = XMM(x@3, x@3, x@3, x@3) - MOVOU X1, X7 - SHUFPS $0xff, X1, X7 - - // i += 4 - // dst = dst[4:] - // src = src[4:] - ADDQ $4, R9 - ADDQ ${{.DstElemSize4}}, DI - ADDQ $16, SI - JMP {{.ShortName}}Loop4 - -{{.ShortName}}Loop1: - // for i < len(src) - CMPQ R9, R11 - JAE {{.ShortName}}End - - // x = src[i] + offset - MOVL (SI), X1 - {{.Add}} X7, X1 - - {{.ClampAndScale}} - - {{.ConvertToInt32}} - - {{.Store1}} - - // offset = x - MOVOU X1, X7 - - // i += 1 - // dst = dst[1:] - // src = src[1:] - ADDQ $1, R9 - ADDQ ${{.DstElemSize1}}, DI - ADDQ $4, SI - JMP {{.ShortName}}Loop1 - -{{.ShortName}}End: - RET diff --git a/vendor/golang.org/x/image/vector/raster_fixed.go b/vendor/golang.org/x/image/vector/raster_fixed.go deleted file mode 100644 index 5b0fe7a..0000000 --- a/vendor/golang.org/x/image/vector/raster_fixed.go +++ /dev/null @@ -1,327 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package vector - -// This file contains a fixed point math implementation of the vector -// graphics rasterizer. - -const ( - // ϕ is the number of binary digits after the fixed point. - // - // For example, if ϕ == 10 (and int1ϕ is based on the int32 type) then we - // are using 22.10 fixed point math. - // - // When changing this number, also change the assembly code (search for ϕ - // in the .s files). - ϕ = 9 - - fxOne int1ϕ = 1 << ϕ - fxOneAndAHalf int1ϕ = 1<<ϕ + 1<<(ϕ-1) - fxOneMinusIota int1ϕ = 1<<ϕ - 1 // Used for rounding up. -) - -// int1ϕ is a signed fixed-point number with 1*ϕ binary digits after the fixed -// point. -type int1ϕ int32 - -// int2ϕ is a signed fixed-point number with 2*ϕ binary digits after the fixed -// point. -// -// The Rasterizer's bufU32 field, nominally of type []uint32 (since that slice -// is also used by other code), can be thought of as a []int2ϕ during the -// fixedLineTo method. Lines of code that are actually like: -// buf[i] += uint32(etc) // buf has type []uint32. -// can be thought of as -// buf[i] += int2ϕ(etc) // buf has type []int2ϕ. -type int2ϕ int32 - -func fixedMax(x, y int1ϕ) int1ϕ { - if x > y { - return x - } - return y -} - -func fixedMin(x, y int1ϕ) int1ϕ { - if x < y { - return x - } - return y -} - -func fixedFloor(x int1ϕ) int32 { return int32(x >> ϕ) } -func fixedCeil(x int1ϕ) int32 { return int32((x + fxOneMinusIota) >> ϕ) } - -func (z *Rasterizer) fixedLineTo(bx, by float32) { - ax, ay := z.penX, z.penY - z.penX, z.penY = bx, by - dir := int1ϕ(1) - if ay > by { - dir, ax, ay, bx, by = -1, bx, by, ax, ay - } - // Horizontal line segments yield no change in coverage. 
Almost horizontal - // segments would yield some change, in ideal math, but the computation - // further below, involving 1 / (by - ay), is unstable in fixed point math, - // so we treat the segment as if it was perfectly horizontal. - if by-ay <= 0.000001 { - return - } - dxdy := (bx - ax) / (by - ay) - - ayϕ := int1ϕ(ay * float32(fxOne)) - byϕ := int1ϕ(by * float32(fxOne)) - - x := int1ϕ(ax * float32(fxOne)) - y := fixedFloor(ayϕ) - yMax := fixedCeil(byϕ) - if yMax > int32(z.size.Y) { - yMax = int32(z.size.Y) - } - width := int32(z.size.X) - - for ; y < yMax; y++ { - dy := fixedMin(int1ϕ(y+1)<<ϕ, byϕ) - fixedMax(int1ϕ(y)<<ϕ, ayϕ) - xNext := x + int1ϕ(float32(dy)*dxdy) - if y < 0 { - x = xNext - continue - } - buf := z.bufU32[y*width:] - d := dy * dir // d ranges up to ±1<<(1*ϕ). - x0, x1 := x, xNext - if x > xNext { - x0, x1 = x1, x0 - } - x0i := fixedFloor(x0) - x0Floor := int1ϕ(x0i) << ϕ - x1i := fixedCeil(x1) - x1Ceil := int1ϕ(x1i) << ϕ - - if x1i <= x0i+1 { - xmf := (x+xNext)>>1 - x0Floor - if i := clamp(x0i+0, width); i < uint(len(buf)) { - buf[i] += uint32(d * (fxOne - xmf)) - } - if i := clamp(x0i+1, width); i < uint(len(buf)) { - buf[i] += uint32(d * xmf) - } - } else { - oneOverS := x1 - x0 - twoOverS := 2 * oneOverS - x0f := x0 - x0Floor - oneMinusX0f := fxOne - x0f - oneMinusX0fSquared := oneMinusX0f * oneMinusX0f - x1f := x1 - x1Ceil + fxOne - x1fSquared := x1f * x1f - - // These next two variables are unused, as rounding errors are - // minimized when we delay the division by oneOverS for as long as - // possible. These lines of code (and the "In ideal math" comments - // below) are commented out instead of deleted in order to aid the - // comparison with the floating point version of the rasterizer. - // - // a0 := ((oneMinusX0f * oneMinusX0f) >> 1) / oneOverS - // am := ((x1f * x1f) >> 1) / oneOverS - - if i := clamp(x0i, width); i < uint(len(buf)) { - // In ideal math: buf[i] += uint32(d * a0) - D := oneMinusX0fSquared // D ranges up to ±1<<(2*ϕ). - D *= d // D ranges up to ±1<<(3*ϕ). - D /= twoOverS - buf[i] += uint32(D) - } - - if x1i == x0i+2 { - if i := clamp(x0i+1, width); i < uint(len(buf)) { - // In ideal math: buf[i] += uint32(d * (fxOne - a0 - am)) - // - // (x1i == x0i+2) and (twoOverS == 2 * (x1 - x0)) implies - // that twoOverS ranges up to +1<<(1*ϕ+2). - D := twoOverS<<ϕ - oneMinusX0fSquared - x1fSquared // D ranges up to ±1<<(2*ϕ+2). - D *= d // D ranges up to ±1<<(3*ϕ+2). - D /= twoOverS - buf[i] += uint32(D) - } - } else { - // This is commented out for the same reason as a0 and am. - // - // a1 := ((fxOneAndAHalf - x0f) << ϕ) / oneOverS - - if i := clamp(x0i+1, width); i < uint(len(buf)) { - // In ideal math: - // buf[i] += uint32(d * (a1 - a0)) - // or equivalently (but better in non-ideal, integer math, - // with respect to rounding errors), - // buf[i] += uint32(A * d / twoOverS) - // where - // A = (a1 - a0) * twoOverS - // = a1*twoOverS - a0*twoOverS - // Noting that twoOverS/oneOverS equals 2, substituting for - // a0 and then a1, given above, yields: - // A = a1*twoOverS - oneMinusX0fSquared - // = (fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared - // = fxOneAndAHalf<<(ϕ+1) - x0f<<(ϕ+1) - oneMinusX0fSquared - // - // This is a positive number minus two non-negative - // numbers. 
For an upper bound on A, the positive number is - // P = fxOneAndAHalf<<(ϕ+1) - // < (2*fxOne)<<(ϕ+1) - // = fxOne<<(ϕ+2) - // = 1<<(2*ϕ+2) - // - // For a lower bound on A, the two non-negative numbers are - // N = x0f<<(ϕ+1) + oneMinusX0fSquared - // ≤ x0f<<(ϕ+1) + fxOne*fxOne - // = x0f<<(ϕ+1) + 1<<(2*ϕ) - // < x0f<<(ϕ+1) + 1<<(2*ϕ+1) - // ≤ fxOne<<(ϕ+1) + 1<<(2*ϕ+1) - // = 1<<(2*ϕ+1) + 1<<(2*ϕ+1) - // = 1<<(2*ϕ+2) - // - // Thus, A ranges up to ±1<<(2*ϕ+2). It is possible to - // derive a tighter bound, but this bound is sufficient to - // reason about overflow. - D := (fxOneAndAHalf-x0f)<<(ϕ+1) - oneMinusX0fSquared // D ranges up to ±1<<(2*ϕ+2). - D *= d // D ranges up to ±1<<(3*ϕ+2). - D /= twoOverS - buf[i] += uint32(D) - } - dTimesS := uint32((d << (2 * ϕ)) / oneOverS) - for xi := x0i + 2; xi < x1i-1; xi++ { - if i := clamp(xi, width); i < uint(len(buf)) { - buf[i] += dTimesS - } - } - - // This is commented out for the same reason as a0 and am. - // - // a2 := a1 + (int1ϕ(x1i-x0i-3)<<(2*ϕ))/oneOverS - - if i := clamp(x1i-1, width); i < uint(len(buf)) { - // In ideal math: - // buf[i] += uint32(d * (fxOne - a2 - am)) - // or equivalently (but better in non-ideal, integer math, - // with respect to rounding errors), - // buf[i] += uint32(A * d / twoOverS) - // where - // A = (fxOne - a2 - am) * twoOverS - // = twoOverS<<ϕ - a2*twoOverS - am*twoOverS - // Noting that twoOverS/oneOverS equals 2, substituting for - // am and then a2, given above, yields: - // A = twoOverS<<ϕ - a2*twoOverS - x1f*x1f - // = twoOverS<<ϕ - a1*twoOverS - (int1ϕ(x1i-x0i-3)<<(2*ϕ))*2 - x1f*x1f - // = twoOverS<<ϕ - a1*twoOverS - int1ϕ(x1i-x0i-3)<<(2*ϕ+1) - x1f*x1f - // Substituting for a1, given above, yields: - // A = twoOverS<<ϕ - ((fxOneAndAHalf-x0f)<<ϕ)*2 - int1ϕ(x1i-x0i-3)<<(2*ϕ+1) - x1f*x1f - // = twoOverS<<ϕ - (fxOneAndAHalf-x0f)<<(ϕ+1) - int1ϕ(x1i-x0i-3)<<(2*ϕ+1) - x1f*x1f - // = B<<ϕ - x1f*x1f - // where - // B = twoOverS - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1) - // = (x1-x0)<<1 - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1) - // - // Re-arranging the defintions given above: - // x0Floor := int1ϕ(x0i) << ϕ - // x0f := x0 - x0Floor - // x1Ceil := int1ϕ(x1i) << ϕ - // x1f := x1 - x1Ceil + fxOne - // combined with fxOne = 1<<ϕ yields: - // x0 = x0f + int1ϕ(x0i)<<ϕ - // x1 = x1f + int1ϕ(x1i-1)<<ϕ - // so that expanding (x1-x0) yields: - // B = (x1f-x0f + int1ϕ(x1i-x0i-1)<<ϕ)<<1 - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1) - // = (x1f-x0f)<<1 + int1ϕ(x1i-x0i-1)<<(ϕ+1) - (fxOneAndAHalf-x0f)<<1 - int1ϕ(x1i-x0i-3)<<(ϕ+1) - // A large part of the second and fourth terms cancel: - // B = (x1f-x0f)<<1 - (fxOneAndAHalf-x0f)<<1 - int1ϕ(-2)<<(ϕ+1) - // = (x1f-x0f)<<1 - (fxOneAndAHalf-x0f)<<1 + 1<<(ϕ+2) - // = (x1f - fxOneAndAHalf)<<1 + 1<<(ϕ+2) - // The first term, (x1f - fxOneAndAHalf)<<1, is a negative - // number, bounded below by -fxOneAndAHalf<<1, which is - // greater than -fxOne<<2, or -1<<(ϕ+2). Thus, B ranges up - // to ±1<<(ϕ+2). One final simplification: - // B = x1f<<1 + (1<<(ϕ+2) - fxOneAndAHalf<<1) - const C = 1<<(ϕ+2) - fxOneAndAHalf<<1 - D := x1f<<1 + C // D ranges up to ±1<<(1*ϕ+2). - D <<= ϕ // D ranges up to ±1<<(2*ϕ+2). - D -= x1fSquared // D ranges up to ±1<<(2*ϕ+3). - D *= d // D ranges up to ±1<<(3*ϕ+3). - D /= twoOverS - buf[i] += uint32(D) - } - } - - if i := clamp(x1i, width); i < uint(len(buf)) { - // In ideal math: buf[i] += uint32(d * am) - D := x1fSquared // D ranges up to ±1<<(2*ϕ). - D *= d // D ranges up to ±1<<(3*ϕ). 
- D /= twoOverS - buf[i] += uint32(D) - } - } - - x = xNext - } -} - -func fixedAccumulateOpOver(dst []uint8, src []uint32) { - // Sanity check that len(dst) >= len(src). - if len(dst) < len(src) { - return - } - - acc := int2ϕ(0) - for i, v := range src { - acc += int2ϕ(v) - a := acc - if a < 0 { - a = -a - } - a >>= 2*ϕ - 16 - if a > 0xffff { - a = 0xffff - } - // This algorithm comes from the standard library's image/draw package. - dstA := uint32(dst[i]) * 0x101 - maskA := uint32(a) - outA := dstA*(0xffff-maskA)/0xffff + maskA - dst[i] = uint8(outA >> 8) - } -} - -func fixedAccumulateOpSrc(dst []uint8, src []uint32) { - // Sanity check that len(dst) >= len(src). - if len(dst) < len(src) { - return - } - - acc := int2ϕ(0) - for i, v := range src { - acc += int2ϕ(v) - a := acc - if a < 0 { - a = -a - } - a >>= 2*ϕ - 8 - if a > 0xff { - a = 0xff - } - dst[i] = uint8(a) - } -} - -func fixedAccumulateMask(buf []uint32) { - acc := int2ϕ(0) - for i, v := range buf { - acc += int2ϕ(v) - a := acc - if a < 0 { - a = -a - } - a >>= 2*ϕ - 16 - if a > 0xffff { - a = 0xffff - } - buf[i] = uint32(a) - } -} diff --git a/vendor/golang.org/x/image/vector/raster_floating.go b/vendor/golang.org/x/image/vector/raster_floating.go deleted file mode 100644 index fd11db1..0000000 --- a/vendor/golang.org/x/image/vector/raster_floating.go +++ /dev/null @@ -1,220 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package vector - -// This file contains a floating point math implementation of the vector -// graphics rasterizer. - -import ( - "math" -) - -func floatingMax(x, y float32) float32 { - if x > y { - return x - } - return y -} - -func floatingMin(x, y float32) float32 { - if x < y { - return x - } - return y -} - -func floatingFloor(x float32) int32 { return int32(math.Floor(float64(x))) } -func floatingCeil(x float32) int32 { return int32(math.Ceil(float64(x))) } - -func (z *Rasterizer) floatingLineTo(bx, by float32) { - ax, ay := z.penX, z.penY - z.penX, z.penY = bx, by - dir := float32(1) - if ay > by { - dir, ax, ay, bx, by = -1, bx, by, ax, ay - } - // Horizontal line segments yield no change in coverage. Almost horizontal - // segments would yield some change, in ideal math, but the computation - // further below, involving 1 / (by - ay), is unstable in floating point - // math, so we treat the segment as if it was perfectly horizontal. - if by-ay <= 0.000001 { - return - } - dxdy := (bx - ax) / (by - ay) - - x := ax - y := floatingFloor(ay) - yMax := floatingCeil(by) - if yMax > int32(z.size.Y) { - yMax = int32(z.size.Y) - } - width := int32(z.size.X) - - for ; y < yMax; y++ { - dy := floatingMin(float32(y+1), by) - floatingMax(float32(y), ay) - - // The "float32" in expressions like "float32(foo*bar)" here and below - // look redundant, since foo and bar already have type float32, but are - // explicit in order to disable the compiler's Fused Multiply Add (FMA) - // instruction selection, which can improve performance but can result - // in different rounding errors in floating point computations. - // - // This package aims to have bit-exact identical results across all - // GOARCHes, and across pure Go code and assembly, so it disables FMA. 
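
A plain-Go illustration of the FMA point made above (the names are illustrative, not from the vendored source): an explicit float32 conversion forces the product to be rounded to float32 before the addition, so the compiler cannot fuse the multiply and add, whereas the unconverted form may be computed as a fused multiply-add and differ in the last bit.

package sketch

// stepRounded matches the style used in floatingLineTo: the float32 conversion
// rounds dy*dxdy before the add, ruling out a fused multiply-add.
func stepRounded(x, dy, dxdy float32) float32 {
	return x + float32(dy*dxdy)
}

// stepMayFuse computes the same value in ideal math, but the Go spec allows
// the intermediate product to be kept at higher precision and fused with the
// add, so the result can vary between architectures.
func stepMayFuse(x, dy, dxdy float32) float32 {
	return x + dy*dxdy
}
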
- // - // See the discussion at - // https://groups.google.com/d/topic/golang-dev/Sti0bl2xUXQ/discussion - xNext := x + float32(dy*dxdy) - if y < 0 { - x = xNext - continue - } - buf := z.bufF32[y*width:] - d := float32(dy * dir) - x0, x1 := x, xNext - if x > xNext { - x0, x1 = x1, x0 - } - x0i := floatingFloor(x0) - x0Floor := float32(x0i) - x1i := floatingCeil(x1) - x1Ceil := float32(x1i) - - if x1i <= x0i+1 { - xmf := float32(0.5*(x+xNext)) - x0Floor - if i := clamp(x0i+0, width); i < uint(len(buf)) { - buf[i] += d - float32(d*xmf) - } - if i := clamp(x0i+1, width); i < uint(len(buf)) { - buf[i] += float32(d * xmf) - } - } else { - s := 1 / (x1 - x0) - x0f := x0 - x0Floor - oneMinusX0f := 1 - x0f - a0 := float32(0.5 * s * oneMinusX0f * oneMinusX0f) - x1f := x1 - x1Ceil + 1 - am := float32(0.5 * s * x1f * x1f) - - if i := clamp(x0i, width); i < uint(len(buf)) { - buf[i] += float32(d * a0) - } - - if x1i == x0i+2 { - if i := clamp(x0i+1, width); i < uint(len(buf)) { - buf[i] += float32(d * (1 - a0 - am)) - } - } else { - a1 := float32(s * (1.5 - x0f)) - if i := clamp(x0i+1, width); i < uint(len(buf)) { - buf[i] += float32(d * (a1 - a0)) - } - dTimesS := float32(d * s) - for xi := x0i + 2; xi < x1i-1; xi++ { - if i := clamp(xi, width); i < uint(len(buf)) { - buf[i] += dTimesS - } - } - a2 := a1 + float32(s*float32(x1i-x0i-3)) - if i := clamp(x1i-1, width); i < uint(len(buf)) { - buf[i] += float32(d * (1 - a2 - am)) - } - } - - if i := clamp(x1i, width); i < uint(len(buf)) { - buf[i] += float32(d * am) - } - } - - x = xNext - } -} - -const ( - // almost256 scales a floating point value in the range [0, 1] to a uint8 - // value in the range [0x00, 0xff]. - // - // 255 is too small. Floating point math accumulates rounding errors, so a - // fully covered src value that would in ideal math be float32(1) might be - // float32(1-ε), and uint8(255 * (1-ε)) would be 0xfe instead of 0xff. The - // uint8 conversion rounds to zero, not to nearest. - // - // 256 is too big. If we multiplied by 256, below, then a fully covered src - // value of float32(1) would translate to uint8(256 * 1), which can be 0x00 - // instead of the maximal value 0xff. - // - // math.Float32bits(almost256) is 0x437fffff. - almost256 = 255.99998 - - // almost65536 scales a floating point value in the range [0, 1] to a - // uint16 value in the range [0x0000, 0xffff]. - // - // math.Float32bits(almost65536) is 0x477fffff. - almost65536 = almost256 * 256 -) - -func floatingAccumulateOpOver(dst []uint8, src []float32) { - // Sanity check that len(dst) >= len(src). - if len(dst) < len(src) { - return - } - - acc := float32(0) - for i, v := range src { - acc += v - a := acc - if a < 0 { - a = -a - } - if a > 1 { - a = 1 - } - // This algorithm comes from the standard library's image/draw package. - dstA := uint32(dst[i]) * 0x101 - maskA := uint32(almost65536 * a) - outA := dstA*(0xffff-maskA)/0xffff + maskA - dst[i] = uint8(outA >> 8) - } -} - -func floatingAccumulateOpSrc(dst []uint8, src []float32) { - // Sanity check that len(dst) >= len(src). - if len(dst) < len(src) { - return - } - - acc := float32(0) - for i, v := range src { - acc += v - a := acc - if a < 0 { - a = -a - } - if a > 1 { - a = 1 - } - dst[i] = uint8(almost256 * a) - } -} - -func floatingAccumulateMask(dst []uint32, src []float32) { - // Sanity check that len(dst) >= len(src). 
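// Why the almost256 constant defined above is 255.99998 rather than 255 or
// 256: a minimal, self-contained sketch, separate from this package. The
// nearlyFull value is an assumed stand-in for a coverage sum that ideal math
// would make exactly 1 but floating point rounding leaves just below 1.
package main

import "fmt"

func main() {
	const almost256 = 255.99998
	full := float32(1)
	nearlyFull := float32(1) - 1e-7 // 1-ε after accumulated rounding error

	fmt.Println(uint8(255 * nearlyFull))       // 254: a fully covered pixel comes out non-opaque
	fmt.Println(uint8(almost256 * full))       // 255
	fmt.Println(uint8(almost256 * nearlyFull)) // 255
	// uint8(256 * full) is deliberately not shown: 256 does not fit in a
	// uint8, so that conversion's result would be implementation-dependent.
}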
- if len(dst) < len(src) { - return - } - - acc := float32(0) - for i, v := range src { - acc += v - a := acc - if a < 0 { - a = -a - } - if a > 1 { - a = 1 - } - dst[i] = uint32(almost65536 * a) - } -} diff --git a/vendor/golang.org/x/image/vector/vector.go b/vendor/golang.org/x/image/vector/vector.go deleted file mode 100644 index 852a4f8..0000000 --- a/vendor/golang.org/x/image/vector/vector.go +++ /dev/null @@ -1,472 +0,0 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:generate go run gen.go -//go:generate asmfmt -w acc_amd64.s - -// asmfmt is https://github.com/klauspost/asmfmt - -// Package vector provides a rasterizer for 2-D vector graphics. -package vector // import "golang.org/x/image/vector" - -// The rasterizer's design follows -// https://medium.com/@raphlinus/inside-the-fastest-font-renderer-in-the-world-75ae5270c445 -// -// Proof of concept code is in -// https://github.com/google/font-go -// -// See also: -// http://nothings.org/gamedev/rasterize/ -// http://projects.tuxee.net/cl-vectors/section-the-cl-aa-algorithm -// https://people.gnome.org/~mathieu/libart/internals.html#INTERNALS-SCANLINE - -import ( - "image" - "image/color" - "image/draw" - "math" -) - -// floatingPointMathThreshold is the width or height above which the rasterizer -// chooses to used floating point math instead of fixed point math. -// -// Both implementations of line segmentation rasterization (see raster_fixed.go -// and raster_floating.go) implement the same algorithm (in ideal, infinite -// precision math) but they perform differently in practice. The fixed point -// math version is roughtly 1.25x faster (on GOARCH=amd64) on the benchmarks, -// but at sufficiently large scales, the computations will overflow and hence -// show rendering artifacts. The floating point math version has more -// consistent quality over larger scales, but it is significantly slower. -// -// This constant determines when to use the faster implementation and when to -// use the better quality implementation. -// -// The rationale for this particular value is that TestRasterizePolygon in -// vector_test.go checks the rendering quality of polygon edges at various -// angles, inscribed in a circle of diameter 512. It may be that a higher value -// would still produce acceptable quality, but 512 seems to work. -const floatingPointMathThreshold = 512 - -func lerp(t, px, py, qx, qy float32) (x, y float32) { - return px + t*(qx-px), py + t*(qy-py) -} - -func clamp(i, width int32) uint { - if i < 0 { - return 0 - } - if i < width { - return uint(i) - } - return uint(width) -} - -// NewRasterizer returns a new Rasterizer whose rendered mask image is bounded -// by the given width and height. -func NewRasterizer(w, h int) *Rasterizer { - z := &Rasterizer{} - z.Reset(w, h) - return z -} - -// Raster is a 2-D vector graphics rasterizer. -// -// The zero value is usable, in that it is a Rasterizer whose rendered mask -// image has zero width and zero height. Call Reset to change its bounds. -type Rasterizer struct { - // bufXxx are buffers of float32 or uint32 values, holding either the - // individual or cumulative area values. - // - // We don't actually need both values at any given time, and to conserve - // memory, the integration of the individual to the cumulative could modify - // the buffer in place. 
In other words, we could use a single buffer, say - // of type []uint32, and add some math.Float32bits and math.Float32frombits - // calls to satisfy the compiler's type checking. As of Go 1.7, though, - // there is a performance penalty between: - // bufF32[i] += x - // and - // bufU32[i] = math.Float32bits(x + math.Float32frombits(bufU32[i])) - // - // See golang.org/issue/17220 for some discussion. - bufF32 []float32 - bufU32 []uint32 - - useFloatingPointMath bool - - size image.Point - firstX float32 - firstY float32 - penX float32 - penY float32 - - // DrawOp is the operator used for the Draw method. - // - // The zero value is draw.Over. - DrawOp draw.Op - - // TODO: an exported field equivalent to the mask point in the - // draw.DrawMask function in the stdlib image/draw package? -} - -// Reset resets a Rasterizer as if it was just returned by NewRasterizer. -// -// This includes setting z.DrawOp to draw.Over. -func (z *Rasterizer) Reset(w, h int) { - z.size = image.Point{w, h} - z.firstX = 0 - z.firstY = 0 - z.penX = 0 - z.penY = 0 - z.DrawOp = draw.Over - - z.setUseFloatingPointMath(w > floatingPointMathThreshold || h > floatingPointMathThreshold) -} - -func (z *Rasterizer) setUseFloatingPointMath(b bool) { - z.useFloatingPointMath = b - - // Make z.bufF32 or z.bufU32 large enough to hold width * height samples. - if z.useFloatingPointMath { - if n := z.size.X * z.size.Y; n > cap(z.bufF32) { - z.bufF32 = make([]float32, n) - } else { - z.bufF32 = z.bufF32[:n] - for i := range z.bufF32 { - z.bufF32[i] = 0 - } - } - } else { - if n := z.size.X * z.size.Y; n > cap(z.bufU32) { - z.bufU32 = make([]uint32, n) - } else { - z.bufU32 = z.bufU32[:n] - for i := range z.bufU32 { - z.bufU32[i] = 0 - } - } - } -} - -// Size returns the width and height passed to NewRasterizer or Reset. -func (z *Rasterizer) Size() image.Point { - return z.size -} - -// Bounds returns the rectangle from (0, 0) to the width and height passed to -// NewRasterizer or Reset. -func (z *Rasterizer) Bounds() image.Rectangle { - return image.Rectangle{Max: z.size} -} - -// Pen returns the location of the path-drawing pen: the last argument to the -// most recent XxxTo call. -func (z *Rasterizer) Pen() (x, y float32) { - return z.penX, z.penY -} - -// ClosePath closes the current path. -func (z *Rasterizer) ClosePath() { - z.LineTo(z.firstX, z.firstY) -} - -// MoveTo starts a new path and moves the pen to (ax, ay). -// -// The coordinates are allowed to be out of the Rasterizer's bounds. -func (z *Rasterizer) MoveTo(ax, ay float32) { - z.firstX = ax - z.firstY = ay - z.penX = ax - z.penY = ay -} - -// LineTo adds a line segment, from the pen to (bx, by), and moves the pen to -// (bx, by). -// -// The coordinates are allowed to be out of the Rasterizer's bounds. -func (z *Rasterizer) LineTo(bx, by float32) { - if z.useFloatingPointMath { - z.floatingLineTo(bx, by) - } else { - z.fixedLineTo(bx, by) - } -} - -// QuadTo adds a quadratic Bézier segment, from the pen via (bx, by) to (cx, -// cy), and moves the pen to (cx, cy). -// -// The coordinates are allowed to be out of the Rasterizer's bounds. 
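// Reset and setUseFloatingPointMath above reuse the sample buffer where
// possible: they allocate only when the capacity is insufficient, and
// otherwise reslice and clear. A minimal sketch of that pattern in
// isolation; resizeAndZero is an illustrative name, not part of this
// package.
package main

import "fmt"

// resizeAndZero returns a zeroed slice of length n, reusing buf's backing
// array when it is already large enough.
func resizeAndZero(buf []float32, n int) []float32 {
	if n > cap(buf) {
		return make([]float32, n) // a fresh allocation is already zeroed
	}
	buf = buf[:n]
	for i := range buf {
		buf[i] = 0
	}
	return buf
}

func main() {
	buf := make([]float32, 4, 16)
	buf = resizeAndZero(buf, 8)     // reuses the existing backing array
	fmt.Println(len(buf), cap(buf)) // 8 16
}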
-func (z *Rasterizer) QuadTo(bx, by, cx, cy float32) { - ax, ay := z.penX, z.penY - devsq := devSquared(ax, ay, bx, by, cx, cy) - if devsq >= 0.333 { - const tol = 3 - n := 1 + int(math.Sqrt(math.Sqrt(tol*float64(devsq)))) - t, nInv := float32(0), 1/float32(n) - for i := 0; i < n-1; i++ { - t += nInv - abx, aby := lerp(t, ax, ay, bx, by) - bcx, bcy := lerp(t, bx, by, cx, cy) - z.LineTo(lerp(t, abx, aby, bcx, bcy)) - } - } - z.LineTo(cx, cy) -} - -// CubeTo adds a cubic Bézier segment, from the pen via (bx, by) and (cx, cy) -// to (dx, dy), and moves the pen to (dx, dy). -// -// The coordinates are allowed to be out of the Rasterizer's bounds. -func (z *Rasterizer) CubeTo(bx, by, cx, cy, dx, dy float32) { - ax, ay := z.penX, z.penY - devsq := devSquared(ax, ay, bx, by, dx, dy) - if devsqAlt := devSquared(ax, ay, cx, cy, dx, dy); devsq < devsqAlt { - devsq = devsqAlt - } - if devsq >= 0.333 { - const tol = 3 - n := 1 + int(math.Sqrt(math.Sqrt(tol*float64(devsq)))) - t, nInv := float32(0), 1/float32(n) - for i := 0; i < n-1; i++ { - t += nInv - abx, aby := lerp(t, ax, ay, bx, by) - bcx, bcy := lerp(t, bx, by, cx, cy) - cdx, cdy := lerp(t, cx, cy, dx, dy) - abcx, abcy := lerp(t, abx, aby, bcx, bcy) - bcdx, bcdy := lerp(t, bcx, bcy, cdx, cdy) - z.LineTo(lerp(t, abcx, abcy, bcdx, bcdy)) - } - } - z.LineTo(dx, dy) -} - -// devSquared returns a measure of how curvy the sequence (ax, ay) to (bx, by) -// to (cx, cy) is. It determines how many line segments will approximate a -// Bézier curve segment. -// -// http://lists.nongnu.org/archive/html/freetype-devel/2016-08/msg00080.html -// gives the rationale for this evenly spaced heuristic instead of a recursive -// de Casteljau approach: -// -// The reason for the subdivision by n is that I expect the "flatness" -// computation to be semi-expensive (it's done once rather than on each -// potential subdivision) and also because you'll often get fewer subdivisions. -// Taking a circular arc as a simplifying assumption (ie a spherical cow), -// where I get n, a recursive approach would get 2^⌈lg n⌉, which, if I haven't -// made any horrible mistakes, is expected to be 33% more in the limit. -func devSquared(ax, ay, bx, by, cx, cy float32) float32 { - devx := ax - 2*bx + cx - devy := ay - 2*by + cy - return devx*devx + devy*devy -} - -// Draw implements the Drawer interface from the standard library's image/draw -// package. -// -// The vector paths previously added via the XxxTo calls become the mask for -// drawing src onto dst. -func (z *Rasterizer) Draw(dst draw.Image, r image.Rectangle, src image.Image, sp image.Point) { - // TODO: adjust r and sp (and mp?) if src.Bounds() doesn't contain - // r.Add(sp.Sub(r.Min)). - - if src, ok := src.(*image.Uniform); ok { - srcR, srcG, srcB, srcA := src.RGBA() - switch dst := dst.(type) { - case *image.Alpha: - // Fast path for glyph rendering. 
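// QuadTo, CubeTo and devSquared above flatten a Bézier segment into n line
// segments, choosing n once from the curve's deviation instead of
// recursively subdividing. A minimal sketch of the same heuristic for a
// quadratic curve, outside this package; flattenQuad is an illustrative
// name.
package main

import (
	"fmt"
	"math"
)

// flattenQuad returns n points approximating the quadratic Bézier curve
// from (ax, ay) via (bx, by) to (cx, cy), sampled at evenly spaced
// parameters as in QuadTo.
func flattenQuad(ax, ay, bx, by, cx, cy float32) [][2]float32 {
	devx := ax - 2*bx + cx
	devy := ay - 2*by + cy
	devsq := devx*devx + devy*devy

	n := 1
	if devsq >= 0.333 {
		const tol = 3
		n = 1 + int(math.Sqrt(math.Sqrt(tol*float64(devsq))))
	}

	pts := make([][2]float32, 0, n)
	for i := 1; i <= n; i++ {
		t := float32(i) / float32(n)
		// De Casteljau evaluation at parameter t.
		abx, aby := ax+t*(bx-ax), ay+t*(by-ay)
		bcx, bcy := bx+t*(cx-bx), by+t*(cy-by)
		pts = append(pts, [2]float32{abx + t*(bcx-abx), aby + t*(bcy-aby)})
	}
	return pts
}

func main() {
	for _, p := range flattenQuad(0, 0, 50, 100, 100, 0) {
		fmt.Println(p)
	}
}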
- if srcA == 0xffff { - if z.DrawOp == draw.Over { - z.rasterizeDstAlphaSrcOpaqueOpOver(dst, r) - } else { - z.rasterizeDstAlphaSrcOpaqueOpSrc(dst, r) - } - return - } - case *image.RGBA: - if z.DrawOp == draw.Over { - z.rasterizeDstRGBASrcUniformOpOver(dst, r, srcR, srcG, srcB, srcA) - } else { - z.rasterizeDstRGBASrcUniformOpSrc(dst, r, srcR, srcG, srcB, srcA) - } - return - } - } - - if z.DrawOp == draw.Over { - z.rasterizeOpOver(dst, r, src, sp) - } else { - z.rasterizeOpSrc(dst, r, src, sp) - } -} - -func (z *Rasterizer) accumulateMask() { - if z.useFloatingPointMath { - if n := z.size.X * z.size.Y; n > cap(z.bufU32) { - z.bufU32 = make([]uint32, n) - } else { - z.bufU32 = z.bufU32[:n] - } - if haveFloatingAccumulateSIMD { - floatingAccumulateMaskSIMD(z.bufU32, z.bufF32) - } else { - floatingAccumulateMask(z.bufU32, z.bufF32) - } - } else { - if haveFixedAccumulateSIMD { - fixedAccumulateMaskSIMD(z.bufU32) - } else { - fixedAccumulateMask(z.bufU32) - } - } -} - -func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpOver(dst *image.Alpha, r image.Rectangle) { - // TODO: non-zero vs even-odd winding? - if r == dst.Bounds() && r == z.Bounds() { - // We bypass the z.accumulateMask step and convert straight from - // z.bufF32 or z.bufU32 to dst.Pix. - if z.useFloatingPointMath { - if haveFloatingAccumulateSIMD { - floatingAccumulateOpOverSIMD(dst.Pix, z.bufF32) - } else { - floatingAccumulateOpOver(dst.Pix, z.bufF32) - } - } else { - if haveFixedAccumulateSIMD { - fixedAccumulateOpOverSIMD(dst.Pix, z.bufU32) - } else { - fixedAccumulateOpOver(dst.Pix, z.bufU32) - } - } - return - } - - z.accumulateMask() - pix := dst.Pix[dst.PixOffset(r.Min.X, r.Min.Y):] - for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { - for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { - ma := z.bufU32[y*z.size.X+x] - i := y*dst.Stride + x - - // This formula is like rasterizeOpOver's, simplified for the - // concrete dst type and opaque src assumption. - a := 0xffff - ma - pix[i] = uint8((uint32(pix[i])*0x101*a/0xffff + ma) >> 8) - } - } -} - -func (z *Rasterizer) rasterizeDstAlphaSrcOpaqueOpSrc(dst *image.Alpha, r image.Rectangle) { - // TODO: non-zero vs even-odd winding? - if r == dst.Bounds() && r == z.Bounds() { - // We bypass the z.accumulateMask step and convert straight from - // z.bufF32 or z.bufU32 to dst.Pix. - if z.useFloatingPointMath { - if haveFloatingAccumulateSIMD { - floatingAccumulateOpSrcSIMD(dst.Pix, z.bufF32) - } else { - floatingAccumulateOpSrc(dst.Pix, z.bufF32) - } - } else { - if haveFixedAccumulateSIMD { - fixedAccumulateOpSrcSIMD(dst.Pix, z.bufU32) - } else { - fixedAccumulateOpSrc(dst.Pix, z.bufU32) - } - } - return - } - - z.accumulateMask() - pix := dst.Pix[dst.PixOffset(r.Min.X, r.Min.Y):] - for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { - for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { - ma := z.bufU32[y*z.size.X+x] - - // This formula is like rasterizeOpSrc's, simplified for the - // concrete dst type and opaque src assumption. - pix[y*dst.Stride+x] = uint8(ma >> 8) - } - } -} - -func (z *Rasterizer) rasterizeDstRGBASrcUniformOpOver(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) { - z.accumulateMask() - pix := dst.Pix[dst.PixOffset(r.Min.X, r.Min.Y):] - for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { - for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { - ma := z.bufU32[y*z.size.X+x] - - // This formula is like rasterizeOpOver's, simplified for the - // concrete dst type and uniform src assumption. 
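// Putting the pieces above together: a minimal usage sketch of the public
// API, not taken from this repository. It rasterizes a triangle into an
// *image.Alpha, which, with an opaque uniform source, takes the
// dst-alpha/src-opaque fast path shown in Draw above.
package main

import (
	"fmt"
	"image"
	"image/draw"

	"golang.org/x/image/vector"
)

func main() {
	const w, h = 16, 16
	z := vector.NewRasterizer(w, h)
	z.MoveTo(2, 2)
	z.LineTo(14, 3)
	z.LineTo(8, 14)
	z.ClosePath()

	dst := image.NewAlpha(image.Rect(0, 0, w, h))
	z.DrawOp = draw.Src
	z.Draw(dst, dst.Bounds(), image.Opaque, image.Point{})

	// Print the coverage mask row by row as hex values.
	for y := 0; y < h; y++ {
		fmt.Printf("% x\n", dst.Pix[y*dst.Stride:y*dst.Stride+w])
	}
}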
- a := 0xffff - (sa * ma / 0xffff) - i := y*dst.Stride + 4*x - pix[i+0] = uint8(((uint32(pix[i+0])*0x101*a + sr*ma) / 0xffff) >> 8) - pix[i+1] = uint8(((uint32(pix[i+1])*0x101*a + sg*ma) / 0xffff) >> 8) - pix[i+2] = uint8(((uint32(pix[i+2])*0x101*a + sb*ma) / 0xffff) >> 8) - pix[i+3] = uint8(((uint32(pix[i+3])*0x101*a + sa*ma) / 0xffff) >> 8) - } - } -} - -func (z *Rasterizer) rasterizeDstRGBASrcUniformOpSrc(dst *image.RGBA, r image.Rectangle, sr, sg, sb, sa uint32) { - z.accumulateMask() - pix := dst.Pix[dst.PixOffset(r.Min.X, r.Min.Y):] - for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { - for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { - ma := z.bufU32[y*z.size.X+x] - - // This formula is like rasterizeOpSrc's, simplified for the - // concrete dst type and uniform src assumption. - i := y*dst.Stride + 4*x - pix[i+0] = uint8((sr * ma / 0xffff) >> 8) - pix[i+1] = uint8((sg * ma / 0xffff) >> 8) - pix[i+2] = uint8((sb * ma / 0xffff) >> 8) - pix[i+3] = uint8((sa * ma / 0xffff) >> 8) - } - } -} - -func (z *Rasterizer) rasterizeOpOver(dst draw.Image, r image.Rectangle, src image.Image, sp image.Point) { - z.accumulateMask() - out := color.RGBA64{} - outc := color.Color(&out) - for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { - for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { - sr, sg, sb, sa := src.At(sp.X+x, sp.Y+y).RGBA() - ma := z.bufU32[y*z.size.X+x] - - // This algorithm comes from the standard library's image/draw - // package. - dr, dg, db, da := dst.At(r.Min.X+x, r.Min.Y+y).RGBA() - a := 0xffff - (sa * ma / 0xffff) - out.R = uint16((dr*a + sr*ma) / 0xffff) - out.G = uint16((dg*a + sg*ma) / 0xffff) - out.B = uint16((db*a + sb*ma) / 0xffff) - out.A = uint16((da*a + sa*ma) / 0xffff) - - dst.Set(r.Min.X+x, r.Min.Y+y, outc) - } - } -} - -func (z *Rasterizer) rasterizeOpSrc(dst draw.Image, r image.Rectangle, src image.Image, sp image.Point) { - z.accumulateMask() - out := color.RGBA64{} - outc := color.Color(&out) - for y, y1 := 0, r.Max.Y-r.Min.Y; y < y1; y++ { - for x, x1 := 0, r.Max.X-r.Min.X; x < x1; x++ { - sr, sg, sb, sa := src.At(sp.X+x, sp.Y+y).RGBA() - ma := z.bufU32[y*z.size.X+x] - - // This algorithm comes from the standard library's image/draw - // package. - out.R = uint16(sr * ma / 0xffff) - out.G = uint16(sg * ma / 0xffff) - out.B = uint16(sb * ma / 0xffff) - out.A = uint16(sa * ma / 0xffff) - - dst.Set(r.Min.X+x, r.Min.Y+y, outc) - } - } -} -- cgit v1.2.3
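For reference, the per-channel arithmetic in rasterizeOpOver and rasterizeOpSrc above is plain Porter-Duff compositing at 16-bit precision, with the rasterizer's coverage acting as the mask. A minimal sketch, independent of the deleted package; overChannel and srcChannel are illustrative names.

package main

import "fmt"

// overChannel composites one 16-bit source channel s (alpha-premultiplied,
// so s <= sa and the intermediate products fit in a uint32) over a 16-bit
// destination channel d, through a 16-bit coverage value ma.
func overChannel(d, s, sa, ma uint32) uint16 {
	a := 0xffff - (sa * ma / 0xffff)
	return uint16((d*a + s*ma) / 0xffff)
}

// srcChannel replaces the destination with the coverage-scaled source.
func srcChannel(s, ma uint32) uint16 {
	return uint16(s * ma / 0xffff)
}

func main() {
	// Fully opaque source channel, half coverage, over a zero destination.
	fmt.Println(overChannel(0x0000, 0xffff, 0xffff, 0x8000)) // 32768 (0x8000)
	fmt.Println(srcChannel(0xffff, 0x8000))                  // 32768 (0x8000)
}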