View Javadoc

1   /**
2    * Copyright (c) 2008-2012, http://www.snakeyaml.org
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.yaml.snakeyaml.issues.issue148;
17  
18  import java.util.Formatter;
19  
20  import junit.framework.TestCase;
21  
22  import org.yaml.snakeyaml.DumperOptions;
23  import org.yaml.snakeyaml.DumperOptions.ScalarStyle;
24  import org.yaml.snakeyaml.Yaml;
25  import org.yaml.snakeyaml.reader.ReaderException;
26  
27  public class PrintableUnicodeTest extends TestCase {
28      public void testFFFD() {
29          Yaml yaml = createYaml();
30          String fffd = yaml.dump("\uFFFD");
31          assertEquals("\"\\ufffd\"\n", fffd);
32      }
33  
34      public void testSerialization() {
35          // test serialization of all Unicode codepoints
36          Yaml yaml = createYaml();
37          for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) {
38              String original = Character.toString((char) c);
39              String serialized = yaml.dump(original);
40  
41              // "On output, a YAML processor must only produce these acceptable
42              // characters,
43              // and should also escape all non-printable Unicode characters."
44              for (int i = 0; i < serialized.length(); i++) {
45                  int cp = (int) serialized.charAt(i);
46                  if (!isAcceptable(cp))
47                      fail(String.format(
48                              "U+%04x: Serialization produced result with unacceptable U+%04x\n", c,
49                              cp));
50                  if (!isPrintable(cp))
51                      fail(String.format(
52                              "U+%04x: Serialization produced result with nonprintable U+%04x\n", c,
53                              cp));
54              }
55          }
56      }
57  
58      public void testDeserialization() {
59          // test deserialization of non-escaped codepoints
60          for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) {
61              // ignore breaks, which have special meaning
62              if (c == 0x0A || c == 0x0D || c == 0x85 || c == 0x2028 || c == 0x2029)
63                  continue;
64              if (!isAcceptable(c) || c == 0x27)
65                  continue;
66              String expected = Character.toString((char) c);
67              String serialized = "'" + expected + "'";
68  
69              String result;
70              try {
71                  result = new Yaml().load(serialized).toString();
72              } catch (ReaderException e) {
73                  fail(String
74                          .format("U+%04x: Deserialization threw ReaderException for an acceptable character\n",
75                                  c));
76                  continue;
77              }
78              if (!result.equals(expected))
79                  fail(String.format("U+%04x: Deserialization incorrect: %s\n", c, hexdump(result)));
80          }
81      }
82  
83      public void testDeserialization2() {
84          // test deserialization of escaped codepoints
85          // "Any such characters must be presented using escape sequences."
86          for (int c = Character.MIN_VALUE; c <= Character.MAX_VALUE; c++) {
87              String expected = Character.toString((char) c);
88              String serialized = String.format("\"\\u%04x\"", c);
89  
90              String result;
91              try {
92                  result = new Yaml().load(serialized).toString();
93              } catch (ReaderException e) {
94                  fail(String
95                          .format("U+%04x: Deserialization threw ReaderException for an acceptable escaped character\n",
96                                  c));
97                  continue;
98              }
99              if (!result.equals(expected))
100                 fail(String.format("U+%04x: Deserialization of escaped character incorrect: %s\n",
101                         c, hexdump(result)));
102         }
103     }
104 
105     private Yaml createYaml() {
106         DumperOptions options = new DumperOptions();
107         options.setAllowUnicode(false);
108         options.setDefaultScalarStyle(ScalarStyle.DOUBLE_QUOTED);
109         return new Yaml(options);
110     }
111 
112     /**
113      * Test whether a character is printable, according to the YAML spec.
114      * ('c-printable')
115      */
116     public static boolean isPrintable(int c) {
117         return c == 0x9 || c == 0xA || c == 0xD || (c >= 0x20 && c <= 0x7E) // 8
118                                                                             // bit
119                 || c == 0x85 || (c >= 0xA0 && c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD) // 16
120                                                                                            // bit
121                 || (c >= 0x10000 && c <= 0x10FFFF); // 32 bit
122     }
123 
124     /**
125      * "On input, a YAML processor must accept all printable ASCII characters,
126      * the space, tab, line break, and all Unicode characters beyond #x9F. On
127      * output, a YAML processor must only produce these acceptable characters,
128      * and should also escape all non-printable Unicode characters. The allowed
129      * character range explicitly excludes the surrogate block #xD800-#xDFFF,
130      * DEL #x7F, the C0 control block #x0-#x1F (except for #x9, #xA, and #xD),
131      * the C1 control block #x80-#x9F, #xFFFE, and #xFFFF."
132      */
133     public static boolean isAcceptable(int c) {
134         return (c >= 0x20 && c <= 0x7e // accept all printable ASCII characters,
135                                        // the space,
136                 || c == 0x09 // tab,
137                 || c == 0x0A || c == 0x0D || c == 0x85 || c == 0x2028 || c == 0x2029 // line
138                                                                                      // break,
139         || isUnicodeCharacter(c) && c >= 0x9F // and all Unicode characters
140                                               // beyond #x9F
141         ) && !( // The allowed character range explicitly excludes
142                 c >= 0xD800 && c <= 0xDFFF // the surrogate block #xD800-#xDFFF
143                         || c == 0x7f // DEL #x7F,
144                         || c <= 0x1F && !(c == 0x09 || c == 0x0A || c == 0x0D) // the
145                                                                                // C0
146                                                                                // control
147                                                                                // block
148                                                                                // #x0-#x1F
149                                                                                // (except
150                                                                                // for
151                                                                                // #x9,
152                                                                                // #xA,
153                                                                                // and
154                                                                                // #xD),
155                         || c >= 0x80 && c <= 0x9F // the C1 control block
156                                                   // #x80-#x9F,
157                         || c == 0xFFFE // #xFFFE,
158                 || c == 0xFFFF // and #xFFFF.
159                 );
160     }
161 
162     /**
163      * Tests whether a codepoint is a designated Unicode noncharacter or not.
164      */
165     public static boolean isUnicodeCharacter(int c) {
166         int plane = c / 0x10000;
167         return !(c >= 0xFDD0 && c <= 0xFDEF) && (plane <= 16 && (c & 0xFFFE) != 0xFFFE);
168     }
169 
170     public static String hexdump(String input) {
171         StringBuilder result = new StringBuilder();
172         Formatter formatter = new Formatter(result);
173         for (int i = 0; i < input.length(); i++)
174             formatter.format("%02x ", (int) input.charAt(i));
175         return result.toString();
176     }
177 }