View Javadoc

1   /**
2    * Copyright (c) 2008-2011, http://www.snakeyaml.org
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.yaml.snakeyaml.reader;
18  
19  /**
20   version: 1.1 / 2007-01-25
21   - changed BOM recognition ordering (longer boms first)
22  
23   Original pseudocode   : Thomas Weidenfeller
24   Implementation tweaked: Aki Nieminen
25   Implementation changed: Andrey Somov 
26   * UTF-32 removed because it is not supported by YAML
27   * no default encoding
28  
29   http://www.unicode.org/unicode/faq/utf_bom.html
30   BOMs:
31   00 00 FE FF    = UTF-32, big-endian
32   FF FE 00 00    = UTF-32, little-endian
33   EF BB BF       = UTF-8,
34   FE FF          = UTF-16, big-endian
35   FF FE          = UTF-16, little-endian
36  
37   Win2k Notepad:
38   Unicode format = UTF-16LE
39   ***/
40  
41  import java.io.IOException;
42  import java.io.InputStream;
43  import java.io.InputStreamReader;
44  import java.io.PushbackInputStream;
45  import java.io.Reader;
46  
/**
 * Generic unicode text reader which uses a leading byte order mark (BOM) to
 * identify the encoding of the underlying stream. UTF-8, UTF-16BE and
 * UTF-16LE marks are recognised; when no BOM is present the stream is decoded
 * as UTF-8.
 */
public class UnicodeReader extends Reader {
    /** Raw input wrapped so that bytes read while probing for a BOM can be pushed back. */
    PushbackInputStream internalIn;
    /** Decoding reader; stays {@code null} until {@link #init()} has detected the encoding. */
    InputStreamReader internalIn2 = null;

    /** Length in bytes of the longest BOM that is recognised (UTF-8: EF BB BF). */
    private static final int BOM_SIZE = 3;

    /**
     * @param in
     *            InputStream to be read
     */
    public UnicodeReader(InputStream in) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
    }

    /**
     * Get stream encoding or NULL if stream is uninitialized. Call init() or
     * read() method to initialize it.
     *
     * @return the encoding name reported by the internal
     *         {@link InputStreamReader}, or {@code null} if nothing has been
     *         read yet
     */
    public String getEncoding() {
        // Before init() there is no decoder; honour the documented contract
        // instead of failing with a NullPointerException.
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Read-ahead up to three bytes and check for BOM marks. Extra bytes are
     * unread back to the stream, only BOM bytes are skipped. Calling this
     * method again after the encoding has been detected is a no-op.
     *
     * @throws IOException
     *             if reading from or pushing back to the underlying stream
     *             fails
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];

        // A single read() may legally deliver fewer bytes than requested even
        // though more are coming, so keep reading until the probe buffer is
        // full or the stream ends. Otherwise a BOM split across two reads
        // would go undetected.
        int n = 0;
        while (n < BOM_SIZE) {
            int count = internalIn.read(bom, n, BOM_SIZE - n);
            if (count <= 0) {
                // -1 is end of stream; 0 is not expected for a positive
                // length, but stop rather than spin if a stream returns it.
                break;
            }
            n += count;
        }

        String encoding;
        int bomLength;
        if (n >= 3 && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB)
                && (bom[2] == (byte) 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = "UTF-8";
            bomLength = 0;
        }

        // Give back everything that was read past the BOM.
        int unread = n - bomLength;
        if (unread > 0) {
            internalIn.unread(bom, bomLength, unread);
        }

        // Use detected encoding
        internalIn2 = new InputStreamReader(internalIn, encoding);
    }

    /**
     * Closes the reader and the underlying stream. If nothing has been read
     * yet the underlying stream is closed directly, without probing it for a
     * BOM first.
     *
     * @throws IOException
     *             if closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 == null) {
            // No decoder was ever created; reading just to close would be
            // wasted work and could fail before the stream gets closed.
            internalIn.close();
        } else {
            internalIn2.close();
        }
    }

    /**
     * Reads decoded characters into a portion of an array, detecting the
     * encoding on first use.
     *
     * @param cbuf
     *            destination buffer
     * @param off
     *            offset at which to start storing characters
     * @param len
     *            maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException
     *             if the underlying stream cannot be read
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }
}