// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// Encoding implementations are provided in other packages, such as
// golang.org/x/text/encoding/charmap and
// golang.org/x/text/encoding/japanese.
package encoding // import "golang.org/x/text/encoding"

import (
	"errors"
	"io"
	"strconv"
	"unicode/utf8"

	"golang.org/x/text/encoding/internal/identifier"
	"golang.org/x/text/transform"
)

// TODO:
// - There seems to be some inconsistency in when decoders return errors
//   and when not. Also documentation seems to suggest they shouldn't return
//   errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
//   normal form. Perhaps add an example how users could prepare their output.

// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
type Encoding interface {
	// NewDecoder returns a Decoder, which converts bytes in this encoding to
	// UTF-8.
	NewDecoder() *Decoder

	// NewEncoder returns an Encoder, which converts UTF-8 to bytes in this
	// encoding.
	NewEncoder() *Encoder
}

// A Decoder converts bytes to UTF-8. It implements transform.Transformer by
// embedding one.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
type Decoder struct {
	transform.Transformer

	// This forces external creators of Decoders to use names in struct
	// initializers, allowing for future extensibility without having to break
	// code.
	_ struct{}
}

// Bytes converts the given encoded bytes to UTF-8. It returns the converted
// bytes or nil, err if any error occurred.
func (d *Decoder) Bytes(b []byte) ([]byte, error) { b, _, err := transform.Bytes(d, b) if err != nil { return nil, err } return b, nil } // String converts the given encoded string to UTF-8. It returns the converted // string or "", err if any error occurred. func (d *Decoder) String(s string) (string, error) { s, _, err := transform.String(d, s) if err != nil { return "", err } return s, nil } // Reader wraps another Reader to decode its bytes. // // The Decoder may not be used for any other operation as long as the returned // Reader is in use. func (d *Decoder) Reader(r io.Reader) io.Reader { return transform.NewReader(r, d) } // An Encoder converts bytes from UTF-8. It implements transform.Transformer. // // Each rune that cannot be transcoded will result in an error. In this case, // the transform will consume all source byte up to, not including the offending // rune. Transforming source bytes that are not valid UTF-8 will be replaced by // `\uFFFD`. To return early with an error instead, use transform.Chain to // preprocess the data with a UTF8Validator. type Encoder struct { transform.Transformer // This forces external creators of Encoders to use names in struct // initializers, allowing for future extendibility without having to break // code. _ struct{} } // Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if // any error occurred. func (e *Encoder) Bytes(b []byte) ([]byte, error) { b, _, err := transform.Bytes(e, b) if err != nil { return nil, err } return b, nil } // String converts a string from UTF-8. It returns the converted string or // "", err if any error occurred. func (e *Encoder) String(s string) (string, error) { s, _, err := transform.String(e, s) if err != nil { return "", err } return s, nil } // Writer wraps another Writer to encode its UTF-8 output. // // The Encoder may not be used for any other operation as long as the returned // Writer is in use. 
func (e *Encoder) Writer(w io.Writer) io.Writer { return transform.NewWriter(w, e) } // ASCIISub is the ASCII substitute character, as recommended by // http://unicode.org/reports/tr36/#Text_Comparison const ASCIISub = '\x1a' // Nop is the nop encoding. Its transformed bytes are the same as the source // bytes; it does not replace invalid UTF-8 sequences. var Nop Encoding = nop{} type nop struct{} func (nop) NewDecoder() *Decoder { return &Decoder{Transformer: transform.Nop} } func (nop) NewEncoder() *Encoder { return &Encoder{Transformer: transform.Nop} } // Replacement is the replacement encoding. Decoding from the replacement // encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to // the replacement encoding yields the same as the source bytes except that // invalid UTF-8 is converted to '\uFFFD'. // // It is defined at http://encoding.spec.whatwg.org/#replacement var Replacement Encoding = replacement{} type replacement struct{} func (replacement) NewDecoder() *Decoder { return &Decoder{Transformer: replacementDecoder{}} } func (replacement) NewEncoder() *Encoder { return &Encoder{Transformer: replacementEncoder{}} } func (replacement) ID() (mib identifier.MIB, other string) { return identifier.Replacement, "" } type replacementDecoder struct{ transform.NopResetter } func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { if len(dst) < 3 { return 0, 0, transform.ErrShortDst } if atEOF { const fffd = "\ufffd" dst[0] = fffd[0] dst[1] = fffd[1] dst[2] = fffd[2] nDst = 3 } return nDst, len(src), nil } type replacementEncoder struct{ transform.NopResetter } func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { r, size := rune(0), 0 for ; nSrc < len(src); nSrc += size { r = rune(src[nSrc]) // Decode a 1-byte rune. if r < utf8.RuneSelf { size = 1 } else { // Decode a multi-byte rune. 
r, size = utf8.DecodeRune(src[nSrc:]) if size == 1 { // All valid runes of size 1 (those below utf8.RuneSelf) were // handled above. We have invalid UTF-8 or we haven't seen the // full character yet. if !atEOF && !utf8.FullRune(src[nSrc:]) { err = transform.ErrShortSrc break } r = '\ufffd' } } if nDst+utf8.RuneLen(r) > len(dst) { err = transform.ErrShortDst break } nDst += utf8.EncodeRune(dst[nDst:], r) } return nDst, nSrc, err } // HTMLEscapeUnsupported wraps encoders to replace source runes outside the // repertoire of the destination encoding with HTML escape sequences. // // This wrapper exists to comply to URL and HTML forms requiring a // non-terminating legacy encoder. The produced sequences may lead to data // loss as they are indistinguishable from legitimate input. To avoid this // issue, use UTF-8 encodings whenever possible. func HTMLEscapeUnsupported(e *Encoder) *Encoder { return &Encoder{Transformer: &errorHandler{e, errorToHTML}} } // ReplaceUnsupported wraps encoders to replace source runes outside the // repertoire of the destination encoding with an encoding-specific // replacement. // // This wrapper is only provided for backwards compatibility and legacy // handling. Its use is strongly discouraged. Use UTF-8 whenever possible. func ReplaceUnsupported(e *Encoder) *Encoder { return &Encoder{Transformer: &errorHandler{e, errorToReplacement}} } type errorHandler struct { *Encoder handler func(dst []byte, r rune, err repertoireError) (n int, ok bool) } // TODO: consider making this error public in some form. 
type repertoireError interface { Replacement() byte } func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF) for err != nil { rerr, ok := err.(repertoireError) if !ok { return nDst, nSrc, err } r, sz := utf8.DecodeRune(src[nSrc:]) n, ok := h.handler(dst[nDst:], r, rerr) if !ok { return nDst, nSrc, transform.ErrShortDst } err = nil nDst += n if nSrc += sz; nSrc < len(src) { var dn, sn int dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF) nDst += dn nSrc += sn } } return nDst, nSrc, err } func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) { buf := [8]byte{} b := strconv.AppendUint(buf[:0], uint64(r), 10) if n = len(b) + len("&#;"); n >= len(dst) { return 0, false } dst[0] = '&' dst[1] = '#' dst[copy(dst[2:], b)+2] = ';' return n, true } func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) { if len(dst) == 0 { return 0, false } dst[0] = err.Replacement() return 1, true } // ErrInvalidUTF8 means that a transformer encountered invalid UTF-8. var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8") // UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first // input byte that is not valid UTF-8. var UTF8Validator transform.Transformer = utf8Validator{} type utf8Validator struct{ transform.NopResetter } func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { n := len(src) if n > len(dst) { n = len(dst) } for i := 0; i < n; { if c := src[i]; c < utf8.RuneSelf { dst[i] = c i++ continue } _, size := utf8.DecodeRune(src[i:]) if size == 1 { // All valid runes of size 1 (those below utf8.RuneSelf) were // handled above. We have invalid UTF-8 or we haven't seen the // full character yet. 
err = ErrInvalidUTF8 if !atEOF && !utf8.FullRune(src[i:]) { err = transform.ErrShortSrc } return i, i, err } if i+size > len(dst) { return i, i, transform.ErrShortDst } for ; size > 0; size-- { dst[i] = src[i] i++ } } if len(src) > len(dst) { err = transform.ErrShortDst } return n, n, err }