View Javadoc

1   /**
2    * Copyright (c) 2008-2012, http://www.snakeyaml.org
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.yaml.snakeyaml.reader;
17  
18  /**
19   version: 1.1 / 2007-01-25
20   - changed BOM recognition ordering (longer boms first)
21  
22   Original pseudocode   : Thomas Weidenfeller
23   Implementation tweaked: Aki Nieminen
24   Implementation changed: Andrey Somov 
25   * UTF-32 removed because it is not supported by YAML
26   * no default encoding
27  
28   http://www.unicode.org/unicode/faq/utf_bom.html
29   BOMs:
30   00 00 FE FF    = UTF-32, big-endian
31   FF FE 00 00    = UTF-32, little-endian
32   EF BB BF       = UTF-8,
33   FE FF          = UTF-16, big-endian
34   FF FE          = UTF-16, little-endian
35  
36   Win2k Notepad:
37   Unicode format = UTF-16LE
38   ***/
39  
40  import java.io.IOException;
41  import java.io.InputStream;
42  import java.io.InputStreamReader;
43  import java.io.PushbackInputStream;
44  import java.io.Reader;
45  import java.nio.charset.Charset;
46  import java.nio.charset.CharsetDecoder;
47  import java.nio.charset.CodingErrorAction;
48  
/**
 * Generic Unicode text reader which uses a leading byte order mark (BOM) to
 * identify the encoding of the wrapped stream. BOMs for UTF-8, UTF-16BE and
 * UTF-16LE are recognized and skipped; if no BOM is found the stream is
 * decoded as UTF-8.
 */
public class UnicodeReader extends Reader {
    private static final Charset UTF8 = Charset.forName("UTF-8");
    private static final Charset UTF16BE = Charset.forName("UTF-16BE");
    private static final Charset UTF16LE = Charset.forName("UTF-16LE");

    PushbackInputStream internalIn;
    InputStreamReader internalIn2 = null;

    /** Length in bytes of the longest recognized BOM (UTF-8: EF BB BF). */
    private static final int BOM_SIZE = 3;

    /**
     * @param in
     *            InputStream to be read
     */
    public UnicodeReader(InputStream in) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
    }

    /**
     * Get stream encoding or NULL if stream is uninitialized. Call init() or
     * read() method to initialize it.
     *
     * @return the encoding name reported by the underlying
     *         {@link InputStreamReader}, or <code>null</code> if the stream
     *         has not been initialized yet
     */
    public String getEncoding() {
        // Before init() there is no decoder yet; honour the documented
        // contract instead of failing with a NullPointerException.
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Read-ahead up to {@link #BOM_SIZE} bytes and check for BOM marks. Extra
     * bytes are unread back to the stream, only BOM bytes are skipped. Does
     * nothing if the reader is already initialized.
     *
     * @throws IOException
     *             if reading from or pushing back to the stream fails
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];

        // A single read() may legally return fewer bytes than requested, so
        // keep reading until the look-ahead buffer is full or EOF is reached.
        // Otherwise a BOM split across short reads would go undetected.
        int n = 0;
        while (n < bom.length) {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count < 0) {
                break;
            }
            n += count;
        }

        Charset encoding;
        int unread;
        if (n >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
                && (bom[2] == (byte) 0xBF)) {
            encoding = UTF8;
            unread = n - 3;
        } else if (n >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
            encoding = UTF16BE;
            unread = n - 2;
        } else if (n >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
            encoding = UTF16LE;
            unread = n - 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = UTF8;
            unread = n;
        }

        // Push back everything that followed the BOM (or all bytes if there
        // was no BOM) so the decoder sees it.
        if (unread > 0) {
            internalIn.unread(bom, (n - unread), unread);
        }

        // Use detected encoding; unmappable characters are reported as errors
        CharsetDecoder decoder = encoding.newDecoder().onUnmappableCharacter(
                CodingErrorAction.REPORT);
        internalIn2 = new InputStreamReader(internalIn, decoder);
    }

    /**
     * Closes the reader and the wrapped stream. If the reader was never
     * initialized the wrapped stream is closed directly, without reading
     * from it first.
     *
     * @throws IOException
     *             if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            // Do not trigger BOM detection just to close: a failing read
            // would otherwise leave the underlying stream open.
            internalIn.close();
        }
    }

    /**
     * Reads decoded characters into a portion of an array, initializing the
     * reader (BOM detection) on first use.
     *
     * @param cbuf
     *            destination buffer
     * @param off
     *            offset at which to start storing characters
     * @param len
     *            maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException
     *             if an I/O or decoding error occurs
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }
}