// Copyright 2013 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package japanese import ( "unicode/utf8" "golang.org/x/text/encoding" "golang.org/x/text/encoding/internal" "golang.org/x/text/encoding/internal/identifier" "golang.org/x/text/transform" ) // ISO2022JP is the ISO-2022-JP encoding. var ISO2022JP encoding.Encoding = &iso2022JP var iso2022JP = internal.Encoding{ internal.FuncEncoding{iso2022JPNewDecoder, iso2022JPNewEncoder}, "ISO-2022-JP", identifier.ISO2022JP, } func iso2022JPNewDecoder() transform.Transformer { return new(iso2022JPDecoder) } func iso2022JPNewEncoder() transform.Transformer { return new(iso2022JPEncoder) } const ( asciiState = iota katakanaState jis0208State jis0212State ) const asciiEsc = 0x1b type iso2022JPDecoder int func (d *iso2022JPDecoder) Reset() { *d = asciiState } func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { r, size := rune(0), 0 for ; nSrc < len(src); nSrc += size { c0 := src[nSrc] if c0 >= utf8.RuneSelf { r, size = '\ufffd', 1 goto write } if c0 == asciiEsc { if nSrc+2 >= len(src) { if !atEOF { return nDst, nSrc, transform.ErrShortSrc } // TODO: is it correct to only skip 1?? r, size = '\ufffd', 1 goto write } size = 3 c1 := src[nSrc+1] c2 := src[nSrc+2] switch { case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42} *d = jis0208State continue case c1 == '$' && c2 == '(': // 0x24 0x28 if nSrc+3 >= len(src) { if !atEOF { return nDst, nSrc, transform.ErrShortSrc } r, size = '\ufffd', 1 goto write } size = 4 if src[nSrc+3] == 'D' { *d = jis0212State continue } case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A} *d = asciiState continue case c1 == '(' && c2 == 'I': // 0x28 0x49 *d = katakanaState continue } r, size = '\ufffd', 1 goto write } switch *d { case asciiState: r, size = rune(c0), 1 case katakanaState: if c0 < 0x21 || 0x60 <= c0 { r, size = '\ufffd', 1 goto write } r, size = rune(c0)+(0xff61-0x21), 1 default: if c0 == 0x0a { *d = asciiState r, size = rune(c0), 1 goto write } if nSrc+1 >= len(src) { if !atEOF { return nDst, nSrc, transform.ErrShortSrc } r, size = '\ufffd', 1 goto write } size = 2 c1 := src[nSrc+1] i := int(c0-0x21)*94 + int(c1-0x21) if *d == jis0208State && i < len(jis0208Decode) { r = rune(jis0208Decode[i]) } else if *d == jis0212State && i < len(jis0212Decode) { r = rune(jis0212Decode[i]) } else { r = '\ufffd' goto write } if r == 0 { r = '\ufffd' } } write: if nDst+utf8.RuneLen(r) > len(dst) { return nDst, nSrc, transform.ErrShortDst } nDst += utf8.EncodeRune(dst[nDst:], r) } return nDst, nSrc, err } type iso2022JPEncoder int func (e *iso2022JPEncoder) Reset() { *e = asciiState } func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { r, size := rune(0), 0 for ; nSrc < len(src); nSrc += size { r = rune(src[nSrc]) // Decode a 1-byte rune. if r < utf8.RuneSelf { size = 1 } else { // Decode a multi-byte rune. r, size = utf8.DecodeRune(src[nSrc:]) if size == 1 { // All valid runes of size 1 (those below utf8.RuneSelf) were // handled above. We have invalid UTF-8 or we haven't seen the // full character yet. if !atEOF && !utf8.FullRune(src[nSrc:]) { err = transform.ErrShortSrc break } } // func init checks that the switch covers all tables. // // http://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212 // is not used by the iso-2022-jp encoder due to lack of widespread support". // // TODO: do we have to special-case U+00A5 and U+203E, as per // http://encoding.spec.whatwg.org/#iso-2022-jp // Doing so would mean that "\u00a5" would not be preserved // after an encode-decode round trip. switch { case encode0Low <= r && r < encode0High: if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 { goto writeJIS } case encode1Low <= r && r < encode1High: if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 { goto writeJIS } case encode2Low <= r && r < encode2High: if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 { goto writeJIS } case encode3Low <= r && r < encode3High: if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 { goto writeJIS } case encode4Low <= r && r < encode4High: if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 { goto writeJIS } case encode5Low <= r && r < encode5High: if 0xff61 <= r && r < 0xffa0 { goto writeKatakana } if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 { goto writeJIS } } // Switch back to ASCII state in case of error so that an ASCII // replacement character can be written in the correct state. if *e != asciiState { if nDst+3 > len(dst) { err = transform.ErrShortDst break } *e = asciiState dst[nDst+0] = asciiEsc dst[nDst+1] = '(' dst[nDst+2] = 'B' nDst += 3 } err = internal.ErrASCIIReplacement break } if *e != asciiState { if nDst+4 > len(dst) { err = transform.ErrShortDst break } *e = asciiState dst[nDst+0] = asciiEsc dst[nDst+1] = '(' dst[nDst+2] = 'B' nDst += 3 } else if nDst >= len(dst) { err = transform.ErrShortDst break } dst[nDst] = uint8(r) nDst++ continue writeJIS: if *e != jis0208State { if nDst+5 > len(dst) { err = transform.ErrShortDst break } *e = jis0208State dst[nDst+0] = asciiEsc dst[nDst+1] = '$' dst[nDst+2] = 'B' nDst += 3 } else if nDst+2 > len(dst) { err = transform.ErrShortDst break } dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask dst[nDst+1] = 0x21 + uint8(r)&codeMask nDst += 2 continue writeKatakana: if *e != katakanaState { if nDst+4 > len(dst) { err = transform.ErrShortDst break } *e = katakanaState dst[nDst+0] = asciiEsc dst[nDst+1] = '(' dst[nDst+2] = 'I' nDst += 3 } else if nDst >= len(dst) { err = transform.ErrShortDst break } dst[nDst] = uint8(r - (0xff61 - 0x21)) nDst++ continue } if atEOF && err == nil && *e != asciiState { if nDst+3 > len(dst) { err = transform.ErrShortDst } else { *e = asciiState dst[nDst+0] = asciiEsc dst[nDst+1] = '(' dst[nDst+2] = 'B' nDst += 3 } } return nDst, nSrc, err }