From e1e8d058a33f7566f9c565d04b0d8b56f9645c35 Mon Sep 17 00:00:00 2001
From: Dimitri Sokolyuk
Date: Wed, 25 Apr 2018 09:28:54 +0200
Subject: add vendor

---
 vendor/golang.org/x/image/vector/gen.go | 447 ++++++++++++++++++++++++++++++++
 1 file changed, 447 insertions(+)
 create mode 100644 vendor/golang.org/x/image/vector/gen.go

(limited to 'vendor/golang.org/x/image/vector/gen.go')

diff --git a/vendor/golang.org/x/image/vector/gen.go b/vendor/golang.org/x/image/vector/gen.go
new file mode 100644
index 0000000..28b298b
--- /dev/null
+++ b/vendor/golang.org/x/image/vector/gen.go
@@ -0,0 +1,447 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import (
+	"bytes"
+	"io/ioutil"
+	"log"
+	"strings"
+	"text/template"
+)
+
+const (
+	copyright = "" +
+		"// Copyright 2016 The Go Authors. All rights reserved.\n" +
+		"// Use of this source code is governed by a BSD-style\n" +
+		"// license that can be found in the LICENSE file.\n"
+
+	doNotEdit = "// generated by go run gen.go; DO NOT EDIT\n"
+
+	dashDashDash = "// --------"
+)
+
+func main() {
+	tmpl, err := ioutil.ReadFile("gen_acc_amd64.s.tmpl")
+	if err != nil {
+		log.Fatalf("ReadFile: %v", err)
+	}
+	if !bytes.HasPrefix(tmpl, []byte(copyright)) {
+		log.Fatal("source template did not start with the copyright header")
+	}
+	tmpl = tmpl[len(copyright):]
+
+	preamble := []byte(nil)
+	if i := bytes.Index(tmpl, []byte(dashDashDash)); i < 0 {
+		log.Fatalf("source template did not contain %q", dashDashDash)
+	} else {
+		preamble, tmpl = tmpl[:i], tmpl[i:]
+	}
+
+	t, err := template.New("").Parse(string(tmpl))
+	if err != nil {
+		log.Fatalf("Parse: %v", err)
+	}
+
+	out := bytes.NewBuffer(nil)
+	out.WriteString(doNotEdit)
+	out.Write(preamble)
+
+	for i, v := range instances {
+		if i != 0 {
+			out.WriteString("\n")
+		}
+		if strings.Contains(v.LoadArgs, "{{.ShortName}}") {
+			v.LoadArgs = strings.Replace(v.LoadArgs, "{{.ShortName}}", v.ShortName, -1)
+		}
+		if err := t.Execute(out, v); err != nil {
+			log.Fatalf("Execute(%q): %v", v.ShortName, err)
+		}
+	}
+
+	if err := ioutil.WriteFile("acc_amd64.s", out.Bytes(), 0666); err != nil {
+		log.Fatalf("WriteFile: %v", err)
+	}
+}
+
+var instances = []struct {
+	LongName       string
+	ShortName      string
+	FrameSize      string
+	ArgsSize       string
+	Args           string
+	DstElemSize1   int
+	DstElemSize4   int
+	XMM3           string
+	XMM4           string
+	XMM5           string
+	XMM6           string
+	XMM8           string
+	XMM9           string
+	XMM10          string
+	LoadArgs       string
+	Setup          string
+	LoadXMMRegs    string
+	Add            string
+	ClampAndScale  string
+	ConvertToInt32 string
+	Store4         string
+	Store1         string
+}{{
+	LongName:       "fixedAccumulateOpOver",
+	ShortName:      "fxAccOpOver",
+	FrameSize:      fxFrameSize,
+	ArgsSize:       twoArgArgsSize,
+	Args:           "dst []uint8, src []uint32",
+	DstElemSize1:   1 * sizeOfUint8,
+	DstElemSize4:   4 * sizeOfUint8,
+	XMM3:           fxXMM3,
+	XMM4:           fxXMM4,
+	XMM5:           fxXMM5,
+	XMM6:           opOverXMM6,
+	XMM8:           opOverXMM8,
+	XMM9:           opOverXMM9,
+	XMM10:          opOverXMM10,
+	LoadArgs:       twoArgLoadArgs,
+	Setup:          fxSetup,
+	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opOverLoadXMMRegs,
+	Add:            fxAdd,
+	ClampAndScale:  fxClampAndScale,
+	ConvertToInt32: fxConvertToInt32,
+	Store4:         opOverStore4,
+	Store1:         opOverStore1,
+}, {
+	LongName:       "fixedAccumulateOpSrc",
+	ShortName:      "fxAccOpSrc",
+	FrameSize:      fxFrameSize,
+	ArgsSize:       twoArgArgsSize,
+	Args:           "dst []uint8, src []uint32",
+	DstElemSize1:   1 * sizeOfUint8,
+	DstElemSize4:   4 * sizeOfUint8,
+	XMM3:           fxXMM3,
+	XMM4:           fxXMM4,
+	XMM5:           fxXMM5,
+	XMM6:           opSrcXMM6,
+	XMM8:           opSrcXMM8,
+	XMM9:           opSrcXMM9,
+	XMM10:          opSrcXMM10,
+	LoadArgs:       twoArgLoadArgs,
+	Setup:          fxSetup,
+	LoadXMMRegs:    fxLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
+	Add:            fxAdd,
+	ClampAndScale:  fxClampAndScale,
+	ConvertToInt32: fxConvertToInt32,
+	Store4:         opSrcStore4,
+	Store1:         opSrcStore1,
+}, {
+	LongName:       "fixedAccumulateMask",
+	ShortName:      "fxAccMask",
+	FrameSize:      fxFrameSize,
+	ArgsSize:       oneArgArgsSize,
+	Args:           "buf []uint32",
+	DstElemSize1:   1 * sizeOfUint32,
+	DstElemSize4:   4 * sizeOfUint32,
+	XMM3:           fxXMM3,
+	XMM4:           fxXMM4,
+	XMM5:           fxXMM5,
+	XMM6:           maskXMM6,
+	XMM8:           maskXMM8,
+	XMM9:           maskXMM9,
+	XMM10:          maskXMM10,
+	LoadArgs:       oneArgLoadArgs,
+	Setup:          fxSetup,
+	LoadXMMRegs:    fxLoadXMMRegs + "\n" + maskLoadXMMRegs,
+	Add:            fxAdd,
+	ClampAndScale:  fxClampAndScale,
+	ConvertToInt32: fxConvertToInt32,
+	Store4:         maskStore4,
+	Store1:         maskStore1,
+}, {
+	LongName:       "floatingAccumulateOpOver",
+	ShortName:      "flAccOpOver",
+	FrameSize:      flFrameSize,
+	ArgsSize:       twoArgArgsSize,
+	Args:           "dst []uint8, src []float32",
+	DstElemSize1:   1 * sizeOfUint8,
+	DstElemSize4:   4 * sizeOfUint8,
+	XMM3:           flXMM3,
+	XMM4:           flXMM4,
+	XMM5:           flXMM5,
+	XMM6:           opOverXMM6,
+	XMM8:           opOverXMM8,
+	XMM9:           opOverXMM9,
+	XMM10:          opOverXMM10,
+	LoadArgs:       twoArgLoadArgs,
+	Setup:          flSetup,
+	LoadXMMRegs:    flLoadXMMRegs + "\n" + opOverLoadXMMRegs,
+	Add:            flAdd,
+	ClampAndScale:  flClampAndScale,
+	ConvertToInt32: flConvertToInt32,
+	Store4:         opOverStore4,
+	Store1:         opOverStore1,
+}, {
+	LongName:       "floatingAccumulateOpSrc",
+	ShortName:      "flAccOpSrc",
+	FrameSize:      flFrameSize,
+	ArgsSize:       twoArgArgsSize,
+	Args:           "dst []uint8, src []float32",
+	DstElemSize1:   1 * sizeOfUint8,
+	DstElemSize4:   4 * sizeOfUint8,
+	XMM3:           flXMM3,
+	XMM4:           flXMM4,
+	XMM5:           flXMM5,
+	XMM6:           opSrcXMM6,
+	XMM8:           opSrcXMM8,
+	XMM9:           opSrcXMM9,
+	XMM10:          opSrcXMM10,
+	LoadArgs:       twoArgLoadArgs,
+	Setup:          flSetup,
+	LoadXMMRegs:    flLoadXMMRegs + "\n" + opSrcLoadXMMRegs,
+	Add:            flAdd,
+	ClampAndScale:  flClampAndScale,
+	ConvertToInt32: flConvertToInt32,
+	Store4:         opSrcStore4,
+	Store1:         opSrcStore1,
+}, {
+	LongName:       "floatingAccumulateMask",
+	ShortName:      "flAccMask",
+	FrameSize:      flFrameSize,
+	ArgsSize:       twoArgArgsSize,
+	Args:           "dst []uint32, src []float32",
+	DstElemSize1:   1 * sizeOfUint32,
+	DstElemSize4:   4 * sizeOfUint32,
+	XMM3:           flXMM3,
+	XMM4:           flXMM4,
+	XMM5:           flXMM5,
+	XMM6:           maskXMM6,
+	XMM8:           maskXMM8,
+	XMM9:           maskXMM9,
+	XMM10:          maskXMM10,
+	LoadArgs:       twoArgLoadArgs,
+	Setup:          flSetup,
+	LoadXMMRegs:    flLoadXMMRegs + "\n" + maskLoadXMMRegs,
+	Add:            flAdd,
+	ClampAndScale:  flClampAndScale,
+	ConvertToInt32: flConvertToInt32,
+	Store4:         maskStore4,
+	Store1:         maskStore1,
+}}
+
+const (
+	fxFrameSize = `0`
+	flFrameSize = `8`
+
+	oneArgArgsSize = `24`
+	twoArgArgsSize = `48`
+
+	sizeOfUint8  = 1
+	sizeOfUint32 = 4
+
+	fxXMM3 = `-`
+	flXMM3 = `flSignMask`
+
+	fxXMM4 = `-`
+	flXMM4 = `flOne`
+
+	fxXMM5 = `fxAlmost65536`
+	flXMM5 = `flAlmost65536`
+
+	oneArgLoadArgs = `
+		MOVQ buf_base+0(FP), DI
+		MOVQ buf_len+8(FP), BX
+		MOVQ buf_base+0(FP), SI
+		MOVQ buf_len+8(FP), R10
+		`
+	twoArgLoadArgs = `
+		MOVQ dst_base+0(FP), DI
+		MOVQ dst_len+8(FP), BX
+		MOVQ src_base+24(FP), SI
+		MOVQ src_len+32(FP), R10
+		// Sanity check that len(dst) >= len(src).
+		CMPQ BX, R10
+		JLT  {{.ShortName}}End
+		`
+
+	fxSetup = ``
+	flSetup = `
+		// Prepare to set MXCSR bits 13 and 14, so that the CVTPS2PL below is
+		// "Round To Zero".
+		STMXCSR mxcsrOrig-8(SP)
+		MOVL    mxcsrOrig-8(SP), AX
+		ORL     $0x6000, AX
+		MOVL    AX, mxcsrNew-4(SP)
+		`
+
+	fxLoadXMMRegs = `
+		// fxAlmost65536 := XMM(0x0000ffff repeated four times) // Maximum of a uint16.
+		MOVOU fxAlmost65536<>(SB), X5
+		`
+	flLoadXMMRegs = `
+		// flSignMask    := XMM(0x7fffffff repeated four times) // All but the sign bit of a float32.
+		// flOne         := XMM(0x3f800000 repeated four times) // 1 as a float32.
+		// flAlmost65536 := XMM(0x477fffff repeated four times) // 255.99998 * 256 as a float32.
+		MOVOU flSignMask<>(SB), X3
+		MOVOU flOne<>(SB), X4
+		MOVOU flAlmost65536<>(SB), X5
+		`
+
+	fxAdd = `PADDD`
+	flAdd = `ADDPS`
+
+	fxClampAndScale = `
+		// y = abs(x)
+		// y >>= 2 // Shift by 2*ϕ - 16.
+		// y = min(y, fxAlmost65536)
+		//
+		// pabsd  %xmm1,%xmm2
+		// psrld  $0x2,%xmm2
+		// pminud %xmm5,%xmm2
+		//
+		// Hopefully we'll get these opcode mnemonics into the assembler for Go
+		// 1.8. https://golang.org/issue/16007 isn't exactly the same thing, but
+		// it's similar.
+		BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x1e; BYTE $0xd1
+		BYTE $0x66; BYTE $0x0f; BYTE $0x72; BYTE $0xd2; BYTE $0x02
+		BYTE $0x66; BYTE $0x0f; BYTE $0x38; BYTE $0x3b; BYTE $0xd5
+		`
+	flClampAndScale = `
+		// y = x & flSignMask
+		// y = min(y, flOne)
+		// y = mul(y, flAlmost65536)
+		MOVOU X3, X2
+		ANDPS X1, X2
+		MINPS X4, X2
+		MULPS X5, X2
+		`
+
+	fxConvertToInt32 = `
+		// z = convertToInt32(y)
+		// No-op.
+		`
+	flConvertToInt32 = `
+		// z = convertToInt32(y)
+		LDMXCSR mxcsrNew-4(SP)
+		CVTPS2PL X2, X2
+		LDMXCSR mxcsrOrig-8(SP)
+		`
+
+	opOverStore4 = `
+		// Blend over the dst's prior value. SIMD for i in 0..3:
+		//
+		// dstA := uint32(dst[i]) * 0x101
+		// maskA := z@i
+		// outA := dstA*(0xffff-maskA)/0xffff + maskA
+		// dst[i] = uint8(outA >> 8)
+		//
+		// First, set X0 to dstA*(0xffff-maskA).
+		MOVL   (DI), X0
+		PSHUFB X8, X0
+		MOVOU  X9, X11
+		PSUBL  X2, X11
+		PMULLD X11, X0
+		// We implement uint32 division by 0xffff as multiplication by a magic
+		// constant (0x80008001) and then a shift by a magic constant (47).
+		// See TestDivideByFFFF for a justification.
+		//
+		// That multiplication widens from uint32 to uint64, so we have to
+		// duplicate and shift our four uint32s from one XMM register (X0) to
+		// two XMM registers (X0 and X11).
+		//
+		// Move the second and fourth uint32s in X0 to be the first and third
+		// uint32s in X11.
+		MOVOU X0, X11
+		PSRLQ $32, X11
+		// Multiply by magic, shift by magic.
+		//
+		// pmuludq %xmm10,%xmm0
+		// pmuludq %xmm10,%xmm11
+		BYTE $0x66; BYTE $0x41; BYTE $0x0f; BYTE $0xf4; BYTE $0xc2
+		BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0xf4; BYTE $0xda
+		PSRLQ $47, X0
+		PSRLQ $47, X11
+		// Merge the two registers back to one, X11, and add maskA.
+		PSLLQ $32, X11
+		XORPS X0, X11
+		PADDD X11, X2
+		// As per opSrcStore4, shuffle and copy the 4 second-lowest bytes.
+		PSHUFB X6, X2
+		MOVL   X2, (DI)
+		`
+	opSrcStore4 = `
+		// z = shuffleTheSecondLowestBytesOfEach4ByteElement(z)
+		// copy(dst[:4], low4BytesOf(z))
+		PSHUFB X6, X2
+		MOVL   X2, (DI)
+		`
+	maskStore4 = `
+		// copy(dst[:4], z)
+		MOVOU X2, (DI)
+		`
+
+	opOverStore1 = `
+		// Blend over the dst's prior value.
+		//
+		// dstA := uint32(dst[0]) * 0x101
+		// maskA := z
+		// outA := dstA*(0xffff-maskA)/0xffff + maskA
+		// dst[0] = uint8(outA >> 8)
+		MOVBLZX (DI), R12
+		IMULL   $0x101, R12
+		MOVL    X2, R13
+		MOVL    $0xffff, AX
+		SUBL    R13, AX
+		MULL    R12             // MULL's implicit arg is AX, and the result is stored in DX:AX.
+		MOVL    $0x80008001, BX // Dividing by 0xffff is done by first multiplying by a magic constant...
+		MULL    BX              // MULL's implicit arg is AX, and the result is stored in DX:AX.
+		SHRL    $15, DX         // ...and then shifting by another magic constant (47 - 32 = 15).
+		ADDL    DX, R13
+		SHRL    $8, R13
+		MOVB    R13, (DI)
+		`
+	opSrcStore1 = `
+		// dst[0] = uint8(z>>8)
+		MOVL X2, BX
+		SHRL $8, BX
+		MOVB BX, (DI)
+		`
+	maskStore1 = `
+		// dst[0] = uint32(z)
+		MOVL X2, (DI)
+		`
+
+	opOverXMM6 = `gather`
+	opSrcXMM6  = `gather`
+	maskXMM6   = `-`
+
+	opOverXMM8 = `scatterAndMulBy0x101`
+	opSrcXMM8  = `-`
+	maskXMM8   = `-`
+
+	opOverXMM9 = `fxAlmost65536`
+	opSrcXMM9  = `-`
+	maskXMM9   = `-`
+
+	opOverXMM10 = `inverseFFFF`
+	opSrcXMM10  = `-`
+	maskXMM10   = `-`
+
+	opOverLoadXMMRegs = `
+		// gather               := XMM(see above) // PSHUFB shuffle mask.
+		// scatterAndMulBy0x101 := XMM(see above) // PSHUFB shuffle mask.
+		// fxAlmost65536        := XMM(0x0000ffff repeated four times) // 0xffff.
+		// inverseFFFF          := XMM(0x80008001 repeated four times) // Magic constant for dividing by 0xffff.
+		MOVOU gather<>(SB), X6
+		MOVOU scatterAndMulBy0x101<>(SB), X8
+		MOVOU fxAlmost65536<>(SB), X9
+		MOVOU inverseFFFF<>(SB), X10
+		`
+	opSrcLoadXMMRegs = `
+		// gather := XMM(see above) // PSHUFB shuffle mask.
+		MOVOU gather<>(SB), X6
+		`
+	maskLoadXMMRegs = ``
+)
--
cgit v1.2.3
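The fixed-point clamp-and-scale in fxClampAndScale (PABSD, PSRLD, PMINUD, emitted as raw BYTEs because the assembler of the day lacked the mnemonics) is easier to follow one element at a time in plain Go. The sketch below is illustrative only, not the package's own non-SIMD fallback; it assumes the buf deltas are signed values stored in uint32s (which is what the PABSD implies) and that ϕ = 9, as implied by the comment "Shift by 2*ϕ - 16" next to the shift of 2.

package main

import "fmt"

// fixedAccumulateMaskScalar mirrors, one element at a time, what the generated
// fixedAccumulateMask does four lanes at a time: accumulate signed fixed-point
// deltas (PADDD), take the absolute value (PABSD), shift down to 16 bits
// (PSRLD $2, i.e. 2*ϕ - 16), clamp to 0xffff (PMINUD), and store (maskStore).
func fixedAccumulateMaskScalar(buf []uint32) {
	acc := int32(0)
	for i, delta := range buf {
		acc += int32(delta)
		y := acc
		if y < 0 {
			y = -y
		}
		z := uint32(y) >> 2
		if z > 0xffff {
			z = 0xffff
		}
		buf[i] = z
	}
}

func main() {
	// +0.5, +0.5 and -1.0 of pixel coverage in 2ϕ-bit fixed point (ϕ = 9 assumed).
	deltas := []int32{1 << 17, 1 << 17, -(1 << 18)}
	buf := make([]uint32, len(deltas))
	for i, d := range deltas {
		buf[i] = uint32(d)
	}
	fixedAccumulateMaskScalar(buf)
	fmt.Printf("%#x\n", buf) // [0x8000 0xffff 0x0]
}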
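The floating-point path does the same clamp and scale with ANDPS/MINPS/MULPS, and the MXCSR setup in flSetup exists only so that CVTPS2PL truncates toward zero, which is exactly what a Go int32 conversion does. A scalar sketch of the OpSrc variant, again illustrative rather than the package's actual fallback code:

package main

import "fmt"

const flAlmost65536 = 255.99998 * 256 // the 0x477fffff constant above

// floatingAccumulateOpSrcScalar mirrors one generated floating-point
// accumulate step per element: running sum (ADDPS), abs via the sign mask
// (ANDPS), clamp to 1 (MINPS), scale (MULPS), truncate (CVTPS2PL under
// round-to-zero), then store the second-lowest byte (opSrcStore).
func floatingAccumulateOpSrcScalar(dst []uint8, src []float32) {
	acc := float32(0)
	for i, x := range src {
		acc += x
		y := acc
		if y < 0 {
			y = -y
		}
		if y > 1 {
			y = 1
		}
		z := int32(y * flAlmost65536) // int32() truncates, i.e. rounds to zero
		dst[i] = uint8(z >> 8)
	}
}

func main() {
	src := []float32{0.25, 0.5, 0.5, -1.5}
	dst := make([]uint8, len(src))
	floatingAccumulateOpSrcScalar(dst, src)
	fmt.Println(dst) // [63 191 255 63]
}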
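The "over" stores blend the new coverage against the destination's prior value using the formula spelled out in the opOverStore comments. For a single pixel, the arithmetic looks like this in plain Go (a sketch of the formula only; blendOver1 is a hypothetical helper name, not part of the package):

package main

import "fmt"

// blendOver1 is the scalar arithmetic behind opOverStore1: widen the 8-bit
// destination alpha to 16 bits (×0x101), apply "over" against the 16-bit
// mask value, then narrow back to 8 bits.
func blendOver1(dst uint8, maskA uint32) uint8 {
	dstA := uint32(dst) * 0x101
	outA := dstA*(0xffff-maskA)/0xffff + maskA
	return uint8(outA >> 8)
}

func main() {
	// Half coverage (0x8000) over a half-opaque destination (0x80).
	fmt.Println(blendOver1(0x80, 0x8000)) // 192
}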
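The generated code avoids the uint32 division in that formula: as the opOverStore4 comment notes, dividing by 0xffff is replaced by a multiply by 0x80008001 and a shift by 47 (or, in the 32-bit scalar tail of opOverStore1, a MULL followed by a shift of the high half by 15). The standalone check below is not the package's TestDivideByFFFF, just a sketch that exercises the identity on the kinds of values that occur here, i.e. products dstA*(0xffff-maskA) of at most 0xffff*0xffff:

package main

import "fmt"

func main() {
	const magic, shift = 0x80008001, 47
	// Sample inputs up to 0xffff*0xffff, the largest value the blend produces.
	for _, x := range []uint64{0, 1, 0xfffe, 0xffff, 0x10000, 0x12345678, 0xfffe * 0xffff, 0xffff * 0xffff} {
		got := (x * magic) >> shift
		want := x / 0xffff
		if got != want {
			fmt.Printf("mismatch at %#x: got %d, want %d\n", x, got, want)
			return
		}
	}
	fmt.Println("x/0xffff == (x*0x80008001)>>47 for all sampled x")
}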