View Javadoc

1   /**
2    * Copyright (c) 2008-2012, http://www.snakeyaml.org
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.yaml.snakeyaml.scanner;
17  
18  import java.nio.ByteBuffer;
19  import java.nio.charset.CharacterCodingException;
20  import java.util.ArrayList;
21  import java.util.HashMap;
22  import java.util.Iterator;
23  import java.util.LinkedHashMap;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.regex.Pattern;
27  
28  import org.yaml.snakeyaml.error.Mark;
29  import org.yaml.snakeyaml.error.YAMLException;
30  import org.yaml.snakeyaml.reader.StreamReader;
31  import org.yaml.snakeyaml.tokens.AliasToken;
32  import org.yaml.snakeyaml.tokens.AnchorToken;
33  import org.yaml.snakeyaml.tokens.BlockEndToken;
34  import org.yaml.snakeyaml.tokens.BlockEntryToken;
35  import org.yaml.snakeyaml.tokens.BlockMappingStartToken;
36  import org.yaml.snakeyaml.tokens.BlockSequenceStartToken;
37  import org.yaml.snakeyaml.tokens.DirectiveToken;
38  import org.yaml.snakeyaml.tokens.DocumentEndToken;
39  import org.yaml.snakeyaml.tokens.DocumentStartToken;
40  import org.yaml.snakeyaml.tokens.FlowEntryToken;
41  import org.yaml.snakeyaml.tokens.FlowMappingEndToken;
42  import org.yaml.snakeyaml.tokens.FlowMappingStartToken;
43  import org.yaml.snakeyaml.tokens.FlowSequenceEndToken;
44  import org.yaml.snakeyaml.tokens.FlowSequenceStartToken;
45  import org.yaml.snakeyaml.tokens.KeyToken;
46  import org.yaml.snakeyaml.tokens.ScalarToken;
47  import org.yaml.snakeyaml.tokens.StreamEndToken;
48  import org.yaml.snakeyaml.tokens.StreamStartToken;
49  import org.yaml.snakeyaml.tokens.TagToken;
50  import org.yaml.snakeyaml.tokens.TagTuple;
51  import org.yaml.snakeyaml.tokens.Token;
52  import org.yaml.snakeyaml.tokens.ValueToken;
53  import org.yaml.snakeyaml.util.ArrayStack;
54  import org.yaml.snakeyaml.util.UriEncoder;
55  
56  /**
57   * <pre>
58   * Scanner produces tokens of the following types:
59   * STREAM-START
60   * STREAM-END
61   * DIRECTIVE(name, value)
62   * DOCUMENT-START
63   * DOCUMENT-END
64   * BLOCK-SEQUENCE-START
65   * BLOCK-MAPPING-START
66   * BLOCK-END
67   * FLOW-SEQUENCE-START
68   * FLOW-MAPPING-START
69   * FLOW-SEQUENCE-END
70   * FLOW-MAPPING-END
71   * BLOCK-ENTRY
72   * FLOW-ENTRY
73   * KEY
74   * VALUE
75   * ALIAS(value)
76   * ANCHOR(value)
77   * TAG(value)
78   * SCALAR(value, plain, style)
79   * Read comments in the Scanner code for more details.
80   * </pre>
81   */
82  public final class ScannerImpl implements Scanner {
83      /**
84       * A regular expression matching characters which are not in the hexadecimal
85       * set (0-9, A-F, a-f).
86       */
87      private final static Pattern NOT_HEXA = Pattern.compile("[^0-9A-Fa-f]");
88  
89      /**
90       * A mapping from an escaped character in the input stream to the character
91       * that they should be replaced with.
92       * 
93       * YAML defines several common and a few uncommon escape sequences.
94       * 
95       * @see <a href="http://www.yaml.org/spec/current.html#id2517668">4.1.6.
96       *      Escape Sequences</a>
97       */
98      public final static Map<Character, String> ESCAPE_REPLACEMENTS = new HashMap<Character, String>();
99  
100     /**
101      * A mapping from a character to a number of bytes to read-ahead for that
102      * escape sequence. These escape sequences are used to handle unicode
103      * escaping in the following formats, where H is a hexadecimal character:
104      * 
105      * <pre>
106      * &#92;xHH         : escaped 8-bit Unicode character
107      * &#92;uHHHH       : escaped 16-bit Unicode character
108      * &#92;UHHHHHHHH   : escaped 32-bit Unicode character
109      * </pre>
110      * 
111      * @see <a href="http://yaml.org/spec/1.1/current.html#id872840">5.6. Escape
112      *      Sequences</a>
113      */
114     public final static Map<Character, Integer> ESCAPE_CODES = new HashMap<Character, Integer>();
115 
116     static {
117         // ASCII null
118         ESCAPE_REPLACEMENTS.put(Character.valueOf('0'), "\0");
119         // ASCII bell
120         ESCAPE_REPLACEMENTS.put(Character.valueOf('a'), "\u0007");
121         // ASCII backspace
122         ESCAPE_REPLACEMENTS.put(Character.valueOf('b'), "\u0008");
123         // ASCII horizontal tab
124         ESCAPE_REPLACEMENTS.put(Character.valueOf('t'), "\u0009");
125         // ASCII newline (line feed; &#92;n maps to 0x0A)
126         ESCAPE_REPLACEMENTS.put(Character.valueOf('n'), "\n");
127         // ASCII vertical tab
128         ESCAPE_REPLACEMENTS.put(Character.valueOf('v'), "\u000B");
129         // ASCII form-feed
130         ESCAPE_REPLACEMENTS.put(Character.valueOf('f'), "\u000C");
131         // carriage-return (&#92;r maps to 0x0D)
132         ESCAPE_REPLACEMENTS.put(Character.valueOf('r'), "\r");
133         // ASCII escape character (Esc)
134         ESCAPE_REPLACEMENTS.put(Character.valueOf('e'), "\u001B");
135         // ASCII space
136         ESCAPE_REPLACEMENTS.put(Character.valueOf(' '), "\u0020");
137         // ASCII double-quote
138         ESCAPE_REPLACEMENTS.put(Character.valueOf('"'), "\"");
139         // ASCII backslash
140         ESCAPE_REPLACEMENTS.put(Character.valueOf('\\'), "\\");
141         // Unicode next line
142         ESCAPE_REPLACEMENTS.put(Character.valueOf('N'), "\u0085");
143         // Unicode non-breaking-space
144         ESCAPE_REPLACEMENTS.put(Character.valueOf('_'), "\u00A0");
145         // Unicode line-separator
146         ESCAPE_REPLACEMENTS.put(Character.valueOf('L'), "\u2028");
147         // Unicode paragraph separator
148         ESCAPE_REPLACEMENTS.put(Character.valueOf('P'), "\u2029");
149 
150         // 8-bit Unicode
151         ESCAPE_CODES.put(Character.valueOf('x'), 2);
152         // 16-bit Unicode
153         ESCAPE_CODES.put(Character.valueOf('u'), 4);
154         // 32-bit Unicode (Supplementary characters are supported)
155         ESCAPE_CODES.put(Character.valueOf('U'), 8);
156     }
157     private final StreamReader reader;
158     // Had we reached the end of the stream?
159     private boolean done = false;
160 
161     // The number of unclosed '{' and '['. `flow_level == 0` means block
162     // context.
163     private int flowLevel = 0;
164 
165     // List of processed tokens that are not yet emitted.
166     private List<Token> tokens;
167 
168     // Number of tokens that were emitted through the `get_token` method.
169     private int tokensTaken = 0;
170 
171     // The current indentation level.
172     private int indent = -1;
173 
174     // Past indentation levels.
175     private ArrayStack<Integer> indents;
176 
177     // Variables related to simple keys treatment. See PyYAML.
178 
179     /**
180      * <pre>
181      * A simple key is a key that is not denoted by the '?' indicator.
182      * Example of simple keys:
183      *   ---
184      *   block simple key: value
185      *   ? not a simple key:
186      *   : { flow simple key: value }
187      * We emit the KEY token before all keys, so when we find a potential
188      * simple key, we try to locate the corresponding ':' indicator.
189      * Simple keys should be limited to a single line and 1024 characters.
190      * 
191      * Can a simple key start at the current position? A simple key may
192      * start:
193      * - at the beginning of the line, not counting indentation spaces
194      *       (in block context),
195      * - after '{', '[', ',' (in the flow context),
196      * - after '?', ':', '-' (in the block context).
197      * In the block context, this flag also signifies if a block collection
198      * may start at the current position.
199      * </pre>
200      */
201     private boolean allowSimpleKey = true;
202 
203     /*
204      * Keep track of possible simple keys. This is a dictionary. The key is
     * `flow_level`; there can be no more than one possible simple key for each
206      * level. The value is a SimpleKey record: (token_number, required, index,
207      * line, column, mark) A simple key may start with ALIAS, ANCHOR, TAG,
208      * SCALAR(flow), '[', or '{' tokens.
209      */
210     private Map<Integer, SimpleKey> possibleSimpleKeys;
211 
212     public ScannerImpl(StreamReader reader) {
213         this.reader = reader;
214         this.tokens = new ArrayList<Token>(100);
215         this.indents = new ArrayStack<Integer>(10);
216         // The order in possibleSimpleKeys is kept for nextPossibleSimpleKey()
217         this.possibleSimpleKeys = new LinkedHashMap<Integer, SimpleKey>();
218         fetchStreamStart();// Add the STREAM-START token.
219     }
220 
221     /**
222      * Check whether the next token is one of the given types.
223      */
224     public boolean checkToken(Token.ID... choices) {
225         while (needMoreTokens()) {
226             fetchMoreTokens();
227         }
228         if (!this.tokens.isEmpty()) {
229             if (choices.length == 0) {
230                 return true;
231             }
232             // since profiler puts this method on top (it is used a lot), we
233             // should not use 'foreach' here because of the performance reasons
234             Token.ID first = this.tokens.get(0).getTokenId();
235             for (int i = 0; i < choices.length; i++) {
236                 if (first == choices[i]) {
237                     return true;
238                 }
239             }
240         }
241         return false;
242     }
243 
244     /**
245      * Return the next token, but do not delete it from the queue.
246      */
247     public Token peekToken() {
248         while (needMoreTokens()) {
249             fetchMoreTokens();
250         }
251         return this.tokens.get(0);
252     }
253 
254     /**
255      * Return the next token, removing it from the queue.
256      */
257     public Token getToken() {
258         if (!this.tokens.isEmpty()) {
259             this.tokensTaken++;
260             return this.tokens.remove(0);
261         }
262         return null;
263     }
264 
265     // Private methods.
266     /**
267      * Returns true if more tokens should be scanned.
268      */
269     private boolean needMoreTokens() {
270         // If we are done, we do not require more tokens.
271         if (this.done) {
272             return false;
273         }
274         // If we aren't done, but we have no tokens, we need to scan more.
275         if (this.tokens.isEmpty()) {
276             return true;
277         }
278         // The current token may be a potential simple key, so we
279         // need to look further.
280         stalePossibleSimpleKeys();
281         return nextPossibleSimpleKey() == this.tokensTaken;
282     }
283 
284     /**
285      * Fetch one or more tokens from the StreamReader.
286      */
287     private void fetchMoreTokens() {
288         // Eat whitespaces and comments until we reach the next token.
289         scanToNextToken();
290         // Remove obsolete possible simple keys.
291         stalePossibleSimpleKeys();
292         // Compare the current indentation and column. It may add some tokens
293         // and decrease the current indentation level.
294         unwindIndent(reader.getColumn());
295         // Peek the next character, to decide what the next group of tokens
296         // will look like.
297         char ch = reader.peek();
298         switch (ch) {
299         case '\0':
300             // Is it the end of stream?
301             fetchStreamEnd();
302             return;
303         case '%':
304             // Is it a directive?
305             if (checkDirective()) {
306                 fetchDirective();
307                 return;
308             }
309             break;
310         case '-':
311             // Is it the document start?
312             if (checkDocumentStart()) {
313                 fetchDocumentStart();
314                 return;
315                 // Is it the block entry indicator?
316             } else if (checkBlockEntry()) {
317                 fetchBlockEntry();
318                 return;
319             }
320             break;
321         case '.':
322             // Is it the document end?
323             if (checkDocumentEnd()) {
324                 fetchDocumentEnd();
325                 return;
326             }
327             break;
328         // TODO support for BOM within a stream. (not implemented in PyYAML)
329         case '[':
330             // Is it the flow sequence start indicator?
331             fetchFlowSequenceStart();
332             return;
333         case '{':
334             // Is it the flow mapping start indicator?
335             fetchFlowMappingStart();
336             return;
337         case ']':
338             // Is it the flow sequence end indicator?
339             fetchFlowSequenceEnd();
340             return;
341         case '}':
342             // Is it the flow mapping end indicator?
343             fetchFlowMappingEnd();
344             return;
345         case ',':
346             // Is it the flow entry indicator?
347             fetchFlowEntry();
348             return;
349             // see block entry indicator above
350         case '?':
351             // Is it the key indicator?
352             if (checkKey()) {
353                 fetchKey();
354                 return;
355             }
356             break;
357         case ':':
358             // Is it the value indicator?
359             if (checkValue()) {
360                 fetchValue();
361                 return;
362             }
363             break;
364         case '*':
365             // Is it an alias?
366             fetchAlias();
367             return;
368         case '&':
369             // Is it an anchor?
370             fetchAnchor();
371             return;
372         case '!':
373             // Is it a tag?
374             fetchTag();
375             return;
376         case '|':
377             // Is it a literal scalar?
378             if (this.flowLevel == 0) {
379                 fetchLiteral();
380                 return;
381             }
382             break;
383         case '>':
384             // Is it a folded scalar?
385             if (this.flowLevel == 0) {
386                 fetchFolded();
387                 return;
388             }
389             break;
390         case '\'':
391             // Is it a single quoted scalar?
392             fetchSingle();
393             return;
394         case '"':
395             // Is it a double quoted scalar?
396             fetchDouble();
397             return;
398         }
399         // It must be a plain scalar then.
400         if (checkPlain()) {
401             fetchPlain();
402             return;
403         }
404         // No? It's an error. Let's produce a nice error message.We do this by
405         // converting escaped characters into their escape sequences. This is a
406         // backwards use of the ESCAPE_REPLACEMENTS map.
407         String chRepresentation = String.valueOf(ch);
408         for (Character s : ESCAPE_REPLACEMENTS.keySet()) {
409             String v = ESCAPE_REPLACEMENTS.get(s);
410             if (v.equals(chRepresentation)) {
411                 chRepresentation = "\\" + s;// ' ' -> '\t'
412                 break;
413             }
414         }
415         throw new ScannerException("while scanning for the next token", null, "found character "
416                 + ch + "'" + chRepresentation + "' that cannot start any token", reader.getMark());
417     }
418 
419     // Simple keys treatment.
420 
421     /**
422      * Return the number of the nearest possible simple key. Actually we don't
423      * need to loop through the whole dictionary.
424      */
425     private int nextPossibleSimpleKey() {
426         /*
427          * the implementation is not as in PyYAML. Because
428          * this.possibleSimpleKeys is ordered we can simply take the first key
429          */
430         if (!this.possibleSimpleKeys.isEmpty()) {
431             return this.possibleSimpleKeys.values().iterator().next().getTokenNumber();
432         }
433         return -1;
434     }
435 
436     /**
437      * <pre>
438      * Remove entries that are no longer possible simple keys. According to
439      * the YAML specification, simple keys
440      * - should be limited to a single line,
441      * - should be no longer than 1024 characters.
442      * Disabling this procedure will allow simple keys of any length and
443      * height (may cause problems if indentation is broken though).
444      * </pre>
445      */
446     private void stalePossibleSimpleKeys() {
447         if (!this.possibleSimpleKeys.isEmpty()) {
448             for (Iterator<SimpleKey> iterator = this.possibleSimpleKeys.values().iterator(); iterator
449                     .hasNext();) {
450                 SimpleKey key = iterator.next();
451                 if ((key.getLine() != reader.getLine())
452                         || (reader.getIndex() - key.getIndex() > 1024)) {
453                     // If the key is not on the same line as the current
454                     // position OR the difference in column between the token
455                     // start and the current position is more than the maximum
456                     // simple key length, then this cannot be a simple key.
457                     if (key.isRequired()) {
458                         // If the key was required, this implies an error
459                         // condition.
460                         throw new ScannerException("while scanning a simple key", key.getMark(),
461                                 "could not found expected ':'", reader.getMark());
462                     }
463                     iterator.remove();
464                 }
465             }
466         }
467     }
468 
469     /**
470      * The next token may start a simple key. We check if it's possible and save
471      * its position. This function is called for ALIAS, ANCHOR, TAG,
472      * SCALAR(flow), '[', and '{'.
473      */
474     private void savePossibleSimpleKey() {
475         // The next token may start a simple key. We check if it's possible
476         // and save its position. This function is called for
477         // ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
478 
479         // Check if a simple key is required at the current position.
480         // A simple key is required if this position is the root flowLevel, AND
481         // the current indentation level is the same as the last indent-level.
482         boolean required = ((this.flowLevel == 0) && (this.indent == this.reader.getColumn()));
483 
484         if (allowSimpleKey || !required) {
485             // A simple key is required only if it is the first token in the
486             // current line. Therefore it is always allowed.
487         } else {
488             throw new YAMLException(
489                     "A simple key is required only if it is the first token in the current line");
490         }
491 
492         // The next token might be a simple key. Let's save it's number and
493         // position.
494         if (this.allowSimpleKey) {
495             removePossibleSimpleKey();
496             int tokenNumber = this.tokensTaken + this.tokens.size();
497             SimpleKey key = new SimpleKey(tokenNumber, required, reader.getIndex(),
498                     reader.getLine(), this.reader.getColumn(), this.reader.getMark());
499             this.possibleSimpleKeys.put(this.flowLevel, key);
500         }
501     }
502 
503     /**
504      * Remove the saved possible key position at the current flow level.
505      */
506     private void removePossibleSimpleKey() {
507         SimpleKey key = possibleSimpleKeys.remove(flowLevel);
508         if (key != null && key.isRequired()) {
509             throw new ScannerException("while scanning a simple key", key.getMark(),
510                     "could not found expected ':'", reader.getMark());
511         }
512     }
513 
514     // Indentation functions.
515 
516     /**
     * Handle implicitly ending multiple levels of block nodes by decreased
     * indentation. This function becomes important on lines 4 and 7 of this
     * example:
     * 
     * <pre>
     * 1) book one:
     * 2)   part one:
     * 3)     chapter one
     * 4)   part two:
     * 5)     chapter one
     * 6)     chapter two
     * 7) book two:
     * </pre>
     * 
     * In flow context, tokens should respect indentation. Actually the
     * condition should be `self.indent &gt;= column` according to the spec. But
     * this condition will prohibit intuitively correct constructions such as
     * 
     * <pre>
     * key : { }
     * </pre>
535      */
536     private void unwindIndent(int col) {
537         // In the flow context, indentation is ignored. We make the scanner less
538         // restrictive then specification requires.
539         if (this.flowLevel != 0) {
540             return;
541         }
542 
543         // In block context, we may need to issue the BLOCK-END tokens.
544         while (this.indent > col) {
545             Mark mark = reader.getMark();
546             this.indent = this.indents.pop();
547             this.tokens.add(new BlockEndToken(mark, mark));
548         }
549     }
550 
551     /**
552      * Check if we need to increase indentation.
553      */
554     private boolean addIndent(int column) {
555         if (this.indent < column) {
556             this.indents.push(this.indent);
557             this.indent = column;
558             return true;
559         }
560         return false;
561     }
562 
563     // Fetchers.
564 
565     /**
566      * We always add STREAM-START as the first token and STREAM-END as the last
567      * token.
568      */
569     private void fetchStreamStart() {
570         // Read the token.
571         Mark mark = reader.getMark();
572 
573         // Add STREAM-START.
574         Token token = new StreamStartToken(mark, mark);
575         this.tokens.add(token);
576     }
577 
578     private void fetchStreamEnd() {
579         // Set the current intendation to -1.
580         unwindIndent(-1);
581 
582         // Reset simple keys.
583         removePossibleSimpleKey();
584         this.allowSimpleKey = false;
585         this.possibleSimpleKeys.clear();
586 
587         // Read the token.
588         Mark mark = reader.getMark();
589 
590         // Add STREAM-END.
591         Token token = new StreamEndToken(mark, mark);
592         this.tokens.add(token);
593 
594         // The stream is finished.
595         this.done = true;
596     }
597 
598     /**
599      * Fetch a YAML directive. Directives are presentation details that are
600      * interpreted as instructions to the processor. YAML defines two kinds of
601      * directives, YAML and TAG; all other types are reserved for future use.
602      * 
603      * @see http://www.yaml.org/spec/1.1/#id864824
604      */
605     private void fetchDirective() {
606         // Set the current intendation to -1.
607         unwindIndent(-1);
608 
609         // Reset simple keys.
610         removePossibleSimpleKey();
611         this.allowSimpleKey = false;
612 
613         // Scan and add DIRECTIVE.
614         Token tok = scanDirective();
615         this.tokens.add(tok);
616     }
617 
618     /**
619      * Fetch a document-start token ("---").
620      */
621     private void fetchDocumentStart() {
622         fetchDocumentIndicator(true);
623     }
624 
625     /**
626      * Fetch a document-end token ("...").
627      */
628     private void fetchDocumentEnd() {
629         fetchDocumentIndicator(false);
630     }
631 
632     /**
     * Fetch a document indicator, either "---" for "document-start", or else
     * "..." for "document-end". The type is chosen by the given boolean.
635      */
636     private void fetchDocumentIndicator(boolean isDocumentStart) {
637         // Set the current intendation to -1.
638         unwindIndent(-1);
639 
640         // Reset simple keys. Note that there could not be a block collection
641         // after '---'.
642         removePossibleSimpleKey();
643         this.allowSimpleKey = false;
644 
645         // Add DOCUMENT-START or DOCUMENT-END.
646         Mark startMark = reader.getMark();
647         reader.forward(3);
648         Mark endMark = reader.getMark();
649         Token token;
650         if (isDocumentStart) {
651             token = new DocumentStartToken(startMark, endMark);
652         } else {
653             token = new DocumentEndToken(startMark, endMark);
654         }
655         this.tokens.add(token);
656     }
657 
658     private void fetchFlowSequenceStart() {
659         fetchFlowCollectionStart(false);
660     }
661 
662     private void fetchFlowMappingStart() {
663         fetchFlowCollectionStart(true);
664     }
665 
666     /**
667      * Fetch a flow-style collection start, which is either a sequence or a
668      * mapping. The type is determined by the given boolean.
669      * 
670      * A flow-style collection is in a format similar to JSON. Sequences are
671      * started by '[' and ended by ']'; mappings are started by '{' and ended by
672      * '}'.
673      * 
674      * @see http://www.yaml.org/spec/1.1/#id863975
675      * 
676      * @param isMappingStart
677      */
678     private void fetchFlowCollectionStart(boolean isMappingStart) {
679         // '[' and '{' may start a simple key.
680         savePossibleSimpleKey();
681 
682         // Increase the flow level.
683         this.flowLevel++;
684 
685         // Simple keys are allowed after '[' and '{'.
686         this.allowSimpleKey = true;
687 
688         // Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
689         Mark startMark = reader.getMark();
690         reader.forward(1);
691         Mark endMark = reader.getMark();
692         Token token;
693         if (isMappingStart) {
694             token = new FlowMappingStartToken(startMark, endMark);
695         } else {
696             token = new FlowSequenceStartToken(startMark, endMark);
697         }
698         this.tokens.add(token);
699     }
700 
701     private void fetchFlowSequenceEnd() {
702         fetchFlowCollectionEnd(false);
703     }
704 
705     private void fetchFlowMappingEnd() {
706         fetchFlowCollectionEnd(true);
707     }
708 
709     /**
710      * Fetch a flow-style collection end, which is either a sequence or a
711      * mapping. The type is determined by the given boolean.
712      * 
713      * A flow-style collection is in a format similar to JSON. Sequences are
714      * started by '[' and ended by ']'; mappings are started by '{' and ended by
715      * '}'.
716      * 
717      * @see http://www.yaml.org/spec/1.1/#id863975
718      */
719     private void fetchFlowCollectionEnd(boolean isMappingEnd) {
720         // Reset possible simple key on the current level.
721         removePossibleSimpleKey();
722 
723         // Decrease the flow level.
724         this.flowLevel--;
725 
726         // No simple keys after ']' or '}'.
727         this.allowSimpleKey = false;
728 
729         // Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
730         Mark startMark = reader.getMark();
731         reader.forward();
732         Mark endMark = reader.getMark();
733         Token token;
734         if (isMappingEnd) {
735             token = new FlowMappingEndToken(startMark, endMark);
736         } else {
737             token = new FlowSequenceEndToken(startMark, endMark);
738         }
739         this.tokens.add(token);
740     }
741 
742     /**
743      * Fetch an entry in the flow style. Flow-style entries occur either
744      * immediately after the start of a collection, or else after a comma.
745      * 
746      * @see http://www.yaml.org/spec/1.1/#id863975
747      */
748     private void fetchFlowEntry() {
749         // Simple keys are allowed after ','.
750         this.allowSimpleKey = true;
751 
752         // Reset possible simple key on the current level.
753         removePossibleSimpleKey();
754 
755         // Add FLOW-ENTRY.
756         Mark startMark = reader.getMark();
757         reader.forward();
758         Mark endMark = reader.getMark();
759         Token token = new FlowEntryToken(startMark, endMark);
760         this.tokens.add(token);
761     }
762 
763     /**
764      * Fetch an entry in the block style.
765      * 
766      * @see http://www.yaml.org/spec/1.1/#id863975
767      */
768     private void fetchBlockEntry() {
769         // Block context needs additional checks.
770         if (this.flowLevel == 0) {
771             // Are we allowed to start a new entry?
772             if (!this.allowSimpleKey) {
773                 throw new ScannerException(null, null, "sequence entries are not allowed here",
774                         reader.getMark());
775             }
776 
777             // We may need to add BLOCK-SEQUENCE-START.
778             if (addIndent(this.reader.getColumn())) {
779                 Mark mark = reader.getMark();
780                 this.tokens.add(new BlockSequenceStartToken(mark, mark));
781             }
782         } else {
783             // It's an error for the block entry to occur in the flow
784             // context,but we let the parser detect this.
785         }
786         // Simple keys are allowed after '-'.
787         this.allowSimpleKey = true;
788 
789         // Reset possible simple key on the current level.
790         removePossibleSimpleKey();
791 
792         // Add BLOCK-ENTRY.
793         Mark startMark = reader.getMark();
794         reader.forward();
795         Mark endMark = reader.getMark();
796         Token token = new BlockEntryToken(startMark, endMark);
797         this.tokens.add(token);
798     }
799 
800     /**
801      * Fetch a key in a block-style mapping.
802      * 
803      * @see http://www.yaml.org/spec/1.1/#id863975
804      */
805     private void fetchKey() {
806         // Block context needs additional checks.
807         if (this.flowLevel == 0) {
808             // Are we allowed to start a key (not necessary a simple)?
809             if (!this.allowSimpleKey) {
810                 throw new ScannerException(null, null, "mapping keys are not allowed here",
811                         reader.getMark());
812             }
813             // We may need to add BLOCK-MAPPING-START.
814             if (addIndent(this.reader.getColumn())) {
815                 Mark mark = reader.getMark();
816                 this.tokens.add(new BlockMappingStartToken(mark, mark));
817             }
818         }
819         // Simple keys are allowed after '?' in the block context.
820         this.allowSimpleKey = this.flowLevel == 0;
821 
822         // Reset possible simple key on the current level.
823         removePossibleSimpleKey();
824 
825         // Add KEY.
826         Mark startMark = reader.getMark();
827         reader.forward();
828         Mark endMark = reader.getMark();
829         Token token = new KeyToken(startMark, endMark);
830         this.tokens.add(token);
831     }
832 
833     /**
834      * Fetch a value in a block-style mapping.
835      * 
836      * @see http://www.yaml.org/spec/1.1/#id863975
837      */
838     private void fetchValue() {
839         // Do we determine a simple key?
840         SimpleKey key = this.possibleSimpleKeys.remove(this.flowLevel);
841         if (key != null) {
842             // Add KEY.
843             this.tokens.add(key.getTokenNumber() - this.tokensTaken, new KeyToken(key.getMark(),
844                     key.getMark()));
845 
846             // If this key starts a new block mapping, we need to add
847             // BLOCK-MAPPING-START.
848             if (this.flowLevel == 0) {
849                 if (addIndent(key.getColumn())) {
850                     this.tokens.add(key.getTokenNumber() - this.tokensTaken,
851                             new BlockMappingStartToken(key.getMark(), key.getMark()));
852                 }
853             }
854             // There cannot be two simple keys one after another.
855             this.allowSimpleKey = false;
856 
857         } else {// It must be a part of a complex key.
858             // Block context needs additional checks.Do we really need them?
859             // They
860             // will be catched by the parser anyway.)
861             if (this.flowLevel == 0) {
862 
863                 // We are allowed to start a complex value if and only if we can
864                 // start a simple key.
865                 if (!this.allowSimpleKey) {
866                     throw new ScannerException(null, null, "mapping values are not allowed here",
867                             reader.getMark());
868                 }
869             }
870 
871             // If this value starts a new block mapping, we need to add
872             // BLOCK-MAPPING-START. It will be detected as an error later by
873             // the parser.
874             if (flowLevel == 0) {
875                 if (addIndent(reader.getColumn())) {
876                     Mark mark = reader.getMark();
877                     this.tokens.add(new BlockMappingStartToken(mark, mark));
878                 }
879             }
880 
881             // Simple keys are allowed after ':' in the block context.
882             allowSimpleKey = (flowLevel == 0);
883 
884             // Reset possible simple key on the current level.
885             removePossibleSimpleKey();
886         }
887         // Add VALUE.
888         Mark startMark = reader.getMark();
889         reader.forward();
890         Mark endMark = reader.getMark();
891         Token token = new ValueToken(startMark, endMark);
892         this.tokens.add(token);
893     }
894 
895     /**
896      * Fetch an alias, which is a reference to an anchor. Aliases take the
897      * format:
898      * 
899      * <pre>
900      * *(anchor name)
901      * </pre>
902      * 
903      * @see http://www.yaml.org/spec/1.1/#id863390
904      */
905     private void fetchAlias() {
906         // ALIAS could be a simple key.
907         savePossibleSimpleKey();
908 
909         // No simple keys after ALIAS.
910         this.allowSimpleKey = false;
911 
912         // Scan and add ALIAS.
913         Token tok = scanAnchor(false);
914         this.tokens.add(tok);
915     }
916 
917     /**
918      * Fetch an anchor. Anchors take the form:
919      * 
920      * <pre>
921      * &(anchor name)
922      * </pre>
923      * 
924      * @see http://www.yaml.org/spec/1.1/#id863390
925      */
926     private void fetchAnchor() {
927         // ANCHOR could start a simple key.
928         savePossibleSimpleKey();
929 
930         // No simple keys after ANCHOR.
931         this.allowSimpleKey = false;
932 
933         // Scan and add ANCHOR.
934         Token tok = scanAnchor(true);
935         this.tokens.add(tok);
936     }
937 
938     /**
939      * Fetch a tag. Tags take a complex form.
940      * 
941      * @see http://www.yaml.org/spec/1.1/#id861700
942      */
943     private void fetchTag() {
944         // TAG could start a simple key.
945         savePossibleSimpleKey();
946 
947         // No simple keys after TAG.
948         this.allowSimpleKey = false;
949 
950         // Scan and add TAG.
951         Token tok = scanTag();
952         this.tokens.add(tok);
953     }
954 
955     /**
956      * Fetch a literal scalar, denoted with a vertical-bar. This is the type
957      * best used for source code and other content, such as binary data, which
958      * must be included verbatim.
959      * 
960      * @see http://www.yaml.org/spec/1.1/#id863975
961      */
962     private void fetchLiteral() {
963         fetchBlockScalar('|');
964     }
965 
966     /**
967      * Fetch a folded scalar, denoted with a greater-than sign. This is the type
968      * best used for long content, such as the text of a chapter or description.
969      * 
970      * @see http://www.yaml.org/spec/1.1/#id863975
971      */
972     private void fetchFolded() {
973         fetchBlockScalar('>');
974     }
975 
976     /**
977      * Fetch a block scalar (literal or folded).
978      * 
979      * @see http://www.yaml.org/spec/1.1/#id863975
980      * 
981      * @param style
982      */
983     private void fetchBlockScalar(char style) {
984         // A simple key may follow a block scalar.
985         this.allowSimpleKey = true;
986 
987         // Reset possible simple key on the current level.
988         removePossibleSimpleKey();
989 
990         // Scan and add SCALAR.
991         Token tok = scanBlockScalar(style);
992         this.tokens.add(tok);
993     }
994 
995     /**
996      * Fetch a single-quoted (') scalar.
997      */
998     private void fetchSingle() {
999         fetchFlowScalar('\'');
1000     }
1001 
1002     /**
1003      * Fetch a double-quoted (") scalar.
1004      */
1005     private void fetchDouble() {
1006         fetchFlowScalar('"');
1007     }
1008 
1009     /**
1010      * Fetch a flow scalar (single- or double-quoted).
1011      * 
1012      * @see http://www.yaml.org/spec/1.1/#id863975
1013      * 
1014      * @param style
1015      */
1016     private void fetchFlowScalar(char style) {
1017         // A flow scalar could be a simple key.
1018         savePossibleSimpleKey();
1019 
1020         // No simple keys after flow scalars.
1021         this.allowSimpleKey = false;
1022 
1023         // Scan and add SCALAR.
1024         Token tok = scanFlowScalar(style);
1025         this.tokens.add(tok);
1026     }
1027 
1028     /**
1029      * Fetch a plain scalar.
1030      */
1031     private void fetchPlain() {
1032         // A plain scalar could be a simple key.
1033         savePossibleSimpleKey();
1034 
1035         // No simple keys after plain scalars. But note that `scan_plain` will
1036         // change this flag if the scan is finished at the beginning of the
1037         // line.
1038         this.allowSimpleKey = false;
1039 
1040         // Scan and add SCALAR. May change `allow_simple_key`.
1041         Token tok = scanPlain();
1042         this.tokens.add(tok);
1043     }
1044 
1045     // Checkers.
1046     /**
1047      * Returns true if the next thing on the reader is a directive, given that
1048      * the leading '%' has already been checked.
1049      * 
1050      * @see http://www.yaml.org/spec/1.1/#id864824
1051      */
1052     private boolean checkDirective() {
1053         // DIRECTIVE: ^ '%' ...
1054         // The '%' indicator is already checked.
1055         return reader.getColumn() == 0;
1056     }
1057 
1058     /**
1059      * Returns true if the next thing on the reader is a document-start ("---").
1060      * A document-start is always followed immediately by a new line.
1061      */
1062     private boolean checkDocumentStart() {
1063         // DOCUMENT-START: ^ '---' (' '|'\n')
1064         if (reader.getColumn() == 0) {
1065             if ("---".equals(reader.prefix(3)) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
1066                 return true;
1067             }
1068         }
1069         return false;
1070     }
1071 
1072     /**
1073      * Returns true if the next thing on the reader is a document-end ("..."). A
1074      * document-end is always followed immediately by a new line.
1075      */
1076     private boolean checkDocumentEnd() {
1077         // DOCUMENT-END: ^ '...' (' '|'\n')
1078         if (reader.getColumn() == 0) {
1079             if ("...".equals(reader.prefix(3)) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
1080                 return true;
1081             }
1082         }
1083         return false;
1084     }
1085 
1086     /**
1087      * Returns true if the next thing on the reader is a block token.
1088      */
1089     private boolean checkBlockEntry() {
1090         // BLOCK-ENTRY: '-' (' '|'\n')
1091         return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
1092     }
1093 
1094     /**
1095      * Returns true if the next thing on the reader is a key token.
1096      */
1097     private boolean checkKey() {
1098         // KEY(flow context): '?'
1099         if (this.flowLevel != 0) {
1100             return true;
1101         } else {
1102             // KEY(block context): '?' (' '|'\n')
1103             return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
1104         }
1105     }
1106 
1107     /**
1108      * Returns true if the next thing on the reader is a value token.
1109      */
1110     private boolean checkValue() {
1111         // VALUE(flow context): ':'
1112         if (flowLevel != 0) {
1113             return true;
1114         } else {
1115             // VALUE(block context): ':' (' '|'\n')
1116             return Constant.NULL_BL_T_LINEBR.has(reader.peek(1));
1117         }
1118     }
1119 
1120     /**
1121      * Returns true if the next thing on the reader is a plain token.
1122      */
1123     private boolean checkPlain() {
1124         /**
1125          * <pre>
1126          * A plain scalar may start with any non-space character except:
1127          *   '-', '?', ':', ',', '[', ']', '{', '}',
1128          *   '#', '&amp;', '*', '!', '|', '&gt;', '\'', '\&quot;',
1129          *   '%', '@', '`'.
1130          * 
1131          * It may also start with
1132          *   '-', '?', ':'
1133          * if it is followed by a non-space character.
1134          * 
1135          * Note that we limit the last rule to the block context (except the
1136          * '-' character) because we want the flow context to be space
1137          * independent.
1138          * </pre>
1139          */
1140         char ch = reader.peek();
1141         // If the next char is NOT one of the forbidden chars above or
1142         // whitespace, then this is the start of a plain scalar.
1143         return Constant.NULL_BL_T_LINEBR.hasNo(ch, "-?:,[]{}#&*!|>\'\"%@`")
1144                 || (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(1)) && (ch == '-' || (this.flowLevel == 0 && "?:"
1145                         .indexOf(ch) != -1)));
1146     }
1147 
1148     // Scanners.
1149 
1150     /**
1151      * <pre>
1152      * We ignore spaces, line breaks and comments.
1153      * If we find a line break in the block context, we set the flag
1154      * `allow_simple_key` on.
1155      * The byte order mark is stripped if it's the first character in the
1156      * stream. We do not yet support BOM inside the stream as the
1157      * specification requires. Any such mark will be considered as a part
1158      * of the document.
1159      * TODO: We need to make tab handling rules more sane. A good rule is
1160      *   Tabs cannot precede tokens
1161      *   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
1162      *   KEY(block), VALUE(block), BLOCK-ENTRY
1163      * So the checking code is
1164      *   if &lt;TAB&gt;:
1165      *       self.allow_simple_keys = False
1166      * We also need to add the check for `allow_simple_keys == True` to
1167      * `unwind_indent` before issuing BLOCK-END.
1168      * Scanners for block, flow, and plain scalars need to be modified.
1169      * </pre>
1170      */
1171     private void scanToNextToken() {
1172         // If there is a byte order mark (BOM) at the beginning of the stream,
1173         // forward past it.
1174         if (reader.getIndex() == 0 && reader.peek() == '\uFEFF') {
1175             reader.forward();
1176         }
1177         boolean found = false;
1178         while (!found) {
1179             int ff = 0;
1180             // Peek ahead until we find the first non-space character, then
1181             // move forward directly to that character.
1182             while (reader.peek(ff) == ' ') {
1183                 ff++;
1184             }
1185             if (ff > 0) {
1186                 reader.forward(ff);
1187             }
1188             // If the character we have skipped forward to is a comment (#),
1189             // then peek ahead until we find the next end of line. YAML
1190             // comments are from a # to the next new-line. We then forward
1191             // past the comment.
1192             if (reader.peek() == '#') {
1193                 ff = 0;
1194                 while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1195                     ff++;
1196                 }
1197                 if (ff > 0) {
1198                     reader.forward(ff);
1199                 }
1200             }
1201             // If we scanned a line break, then (depending on flow level),
1202             // simple keys may be allowed.
1203             if (scanLineBreak().length() != 0) {// found a line-break
1204                 if (this.flowLevel == 0) {
1205                     // Simple keys are allowed at flow-level 0 after a line
1206                     // break
1207                     this.allowSimpleKey = true;
1208                 }
1209             } else {
1210                 found = true;
1211             }
1212         }
1213     }
1214 
1215     @SuppressWarnings({ "unchecked", "rawtypes" })
1216     private Token scanDirective() {
1217         // See the specification for details.
1218         Mark startMark = reader.getMark();
1219         Mark endMark;
1220         reader.forward();
1221         String name = scanDirectiveName(startMark);
1222         List<?> value = null;
1223         if ("YAML".equals(name)) {
1224             value = scanYamlDirectiveValue(startMark);
1225             endMark = reader.getMark();
1226         } else if ("TAG".equals(name)) {
1227             value = scanTagDirectiveValue(startMark);
1228             endMark = reader.getMark();
1229         } else {
1230             endMark = reader.getMark();
1231             int ff = 0;
1232             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1233                 ff++;
1234             }
1235             if (ff > 0) {
1236                 reader.forward(ff);
1237             }
1238         }
1239         scanDirectiveIgnoredLine(startMark);
1240         return new DirectiveToken(name, value, startMark, endMark);
1241     }
1242 
1243     /**
1244      * Scan a directive name. Directive names are a series of non-space
1245      * characters.
1246      * 
1247      * @see http://www.yaml.org/spec/1.1/#id895217
1248      */
1249     private String scanDirectiveName(Mark startMark) {
1250         // See the specification for details.
1251         int length = 0;
1252         // A Directive-name is a sequence of alphanumeric characters
1253         // (a-z,A-Z,0-9). We scan until we find something that isn't.
1254         // FIXME this disagrees with the specification.
1255         char ch = reader.peek(length);
1256         while (Constant.ALPHA.has(ch)) {
1257             length++;
1258             ch = reader.peek(length);
1259         }
1260         // If the name would be empty, an error occurs.
1261         if (length == 0) {
1262             throw new ScannerException("while scanning a directive", startMark,
1263                     "expected alphabetic or numeric character, but found " + ch + "(" + ((int) ch)
1264                             + ")", reader.getMark());
1265         }
1266         String value = reader.prefixForward(length);
1267         ch = reader.peek();
1268         if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1269             throw new ScannerException("while scanning a directive", startMark,
1270                     "expected alphabetic or numeric character, but found " + ch + "(" + ((int) ch)
1271                             + ")", reader.getMark());
1272         }
1273         return value;
1274     }
1275 
1276     private List<Integer> scanYamlDirectiveValue(Mark startMark) {
1277         // See the specification for details.
1278         while (reader.peek() == ' ') {
1279             reader.forward();
1280         }
1281         Integer major = scanYamlDirectiveNumber(startMark);
1282         if (reader.peek() != '.') {
1283             throw new ScannerException("while scanning a directive", startMark,
1284                     "expected a digit or '.', but found " + reader.peek() + "("
1285                             + ((int) reader.peek()) + ")", reader.getMark());
1286         }
1287         reader.forward();
1288         Integer minor = scanYamlDirectiveNumber(startMark);
1289         if (Constant.NULL_BL_LINEBR.hasNo(reader.peek())) {
1290             throw new ScannerException("while scanning a directive", startMark,
1291                     "expected a digit or ' ', but found " + reader.peek() + "("
1292                             + ((int) reader.peek()) + ")", reader.getMark());
1293         }
1294         List<Integer> result = new ArrayList<Integer>(2);
1295         result.add(major);
1296         result.add(minor);
1297         return result;
1298     }
1299 
1300     /**
1301      * Read a %YAML directive number: this is either the major or the minor
1302      * part. Stop reading at a non-digit character (usually either '.' or '\n').
1303      * 
1304      * @see http://www.yaml.org/spec/1.1/#id895631
1305      * @see http://www.yaml.org/spec/1.1/#ns-dec-digit
1306      */
1307     private Integer scanYamlDirectiveNumber(Mark startMark) {
1308         // See the specification for details.
1309         char ch = reader.peek();
1310         if (!Character.isDigit(ch)) {
1311             throw new ScannerException("while scanning a directive", startMark,
1312                     "expected a digit, but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
1313         }
1314         int length = 0;
1315         while (Character.isDigit(reader.peek(length))) {
1316             length++;
1317         }
1318         Integer value = Integer.parseInt(reader.prefixForward(length));
1319         return value;
1320     }
1321 
1322     /**
1323      * <p>
1324      * Read a %TAG directive value:
1325      * 
1326      * <pre>
1327      * s-ignored-space+ c-tag-handle s-ignored-space+ ns-tag-prefix s-l-comments
1328      * </pre>
1329      * 
1330      * </p>
1331      * 
1332      * @see http://www.yaml.org/spec/1.1/#id896044
1333      */
1334     private List<String> scanTagDirectiveValue(Mark startMark) {
1335         // See the specification for details.
1336         while (reader.peek() == ' ') {
1337             reader.forward();
1338         }
1339         String handle = scanTagDirectiveHandle(startMark);
1340         while (reader.peek() == ' ') {
1341             reader.forward();
1342         }
1343         String prefix = scanTagDirectivePrefix(startMark);
1344         List<String> result = new ArrayList<String>(2);
1345         result.add(handle);
1346         result.add(prefix);
1347         return result;
1348     }
1349 
1350     /**
1351      * Scan a %TAG directive's handle. This is YAML's c-tag-handle.
1352      * 
1353      * @see http://www.yaml.org/spec/1.1/#id896876
1354      * @param startMark
1355      * @return
1356      */
1357     private String scanTagDirectiveHandle(Mark startMark) {
1358         // See the specification for details.
1359         String value = scanTagHandle("directive", startMark);
1360         char ch = reader.peek();
1361         if (ch != ' ') {
1362             throw new ScannerException("while scanning a directive", startMark,
1363                     "expected ' ', but found " + reader.peek() + "(" + ch + ")", reader.getMark());
1364         }
1365         return value;
1366     }
1367 
1368     /**
1369      * Scan a %TAG directive's prefix. This is YAML's ns-tag-prefix.
1370      * 
1371      * @see http://www.yaml.org/spec/1.1/#ns-tag-prefix
1372      */
1373     private String scanTagDirectivePrefix(Mark startMark) {
1374         // See the specification for details.
1375         String value = scanTagUri("directive", startMark);
1376         if (Constant.NULL_BL_LINEBR.hasNo(reader.peek())) {
1377             throw new ScannerException("while scanning a directive", startMark,
1378                     "expected ' ', but found " + reader.peek() + "(" + ((int) reader.peek()) + ")",
1379                     reader.getMark());
1380         }
1381         return value;
1382     }
1383 
1384     private String scanDirectiveIgnoredLine(Mark startMark) {
1385         // See the specification for details.
1386         int ff = 0;
1387         while (reader.peek(ff) == ' ') {
1388             ff++;
1389         }
1390         if (ff > 0) {
1391             reader.forward(ff);
1392         }
1393         if (reader.peek() == '#') {
1394             ff = 0;
1395             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1396                 ff++;
1397             }
1398             reader.forward(ff);
1399         }
1400         char ch = reader.peek();
1401         String lineBreak = scanLineBreak();
1402         if (lineBreak.length() == 0 && ch != '\0') {
1403             throw new ScannerException("while scanning a directive", startMark,
1404                     "expected a comment or a line break, but found " + ch + "(" + ((int) ch) + ")",
1405                     reader.getMark());
1406         }
1407         return lineBreak;
1408     }
1409 
1410     /**
1411      * <pre>
1412      * The specification does not restrict characters for anchors and
1413      * aliases. This may lead to problems, for instance, the document:
1414      *   [ *alias, value ]
1415      * can be interpreted in two ways, as
1416      *   [ &quot;value&quot; ]
1417      * and
1418      *   [ *alias , &quot;value&quot; ]
1419      * Therefore we restrict aliases to numbers and ASCII letters.
1420      * </pre>
1421      */
1422     private Token scanAnchor(boolean isAnchor) {
1423         Mark startMark = reader.getMark();
1424         char indicator = reader.peek();
1425         String name = indicator == '*' ? "alias" : "anchor";
1426         reader.forward();
1427         int length = 0;
1428         char ch = reader.peek(length);
1429         while (Constant.ALPHA.has(ch)) {
1430             length++;
1431             ch = reader.peek(length);
1432         }
1433         if (length == 0) {
1434             throw new ScannerException("while scanning an " + name, startMark,
1435                     "expected alphabetic or numeric character, but found but found " + ch,
1436                     reader.getMark());
1437         }
1438         String value = reader.prefixForward(length);
1439         ch = reader.peek();
1440         if (Constant.NULL_BL_T_LINEBR.hasNo(ch, "?:,]}%@`")) {
1441             throw new ScannerException("while scanning an " + name, startMark,
1442                     "expected alphabetic or numeric character, but found " + ch + "("
1443                             + ((int) reader.peek()) + ")", reader.getMark());
1444         }
1445         Mark endMark = reader.getMark();
1446         Token tok;
1447         if (isAnchor) {
1448             tok = new AnchorToken(value, startMark, endMark);
1449         } else {
1450             tok = new AliasToken(value, startMark, endMark);
1451         }
1452         return tok;
1453     }
1454 
1455     /**
1456      * <p>
1457      * Scan a Tag property. A Tag property may be specified in one of three
1458      * ways: c-verbatim-tag, c-ns-shorthand-tag, or c-ns-non-specific-tag
1459      * </p>
1460      * 
1461      * <p>
1462      * c-verbatim-tag takes the form !&lt;ns-uri-char+&gt; and must be delivered
1463      * verbatim (as-is) to the application. In particular, verbatim tags are not
1464      * subject to tag resolution.
1465      * </p>
1466      * 
1467      * <p>
1468      * c-ns-shorthand-tag is a valid tag handle followed by a non-empty suffix.
1469      * If the tag handle is a c-primary-tag-handle ('!') then the suffix must
1470      * have all exclamation marks properly URI-escaped (%21); otherwise, the
1471      * string will look like a named tag handle: !foo!bar would be interpreted
1472      * as (handle="!foo!", suffix="bar").
1473      * </p>
1474      * 
1475      * <p>
1476      * c-ns-non-specific-tag is always a lone '!'; this is only useful for plain
1477      * scalars, where its specification means that the scalar MUST be resolved
1478      * to have type tag:yaml.org,2002:str.
1479      * </p>
1480      * 
1481      * TODO SnakeYaml incorrectly ignores c-ns-non-specific-tag right now.
1482      * 
1483      * @see http://www.yaml.org/spec/1.1/#id900262
1484      * 
1485      *      TODO Note that this method does not enforce rules about local versus
1486      *      global tags!
1487      */
1488     private Token scanTag() {
1489         // See the specification for details.
1490         Mark startMark = reader.getMark();
1491         // Determine the type of tag property based on the first character
1492         // encountered
1493         char ch = reader.peek(1);
1494         String handle = null;
1495         String suffix = null;
1496         // Verbatim tag! (c-verbatim-tag)
1497         if (ch == '<') {
1498             // Skip the exclamation mark and &gt;, then read the tag suffix (as
1499             // a URI).
1500             reader.forward(2);
1501             suffix = scanTagUri("tag", startMark);
1502             if (reader.peek() != '>') {
1503                 // If there are any characters between the end of the tag-suffix
1504                 // URI and the closing &gt;, then an error has occurred.
1505                 throw new ScannerException("while scanning a tag", startMark,
1506                         "expected '>', but found '" + reader.peek() + "' (" + ((int) reader.peek())
1507                                 + ")", reader.getMark());
1508             }
1509             reader.forward();
1510         } else if (Constant.NULL_BL_T_LINEBR.has(ch)) {
1511             // A NUL, blank, tab, or line-break means that this was a
1512             // c-ns-non-specific tag.
1513             suffix = "!";
1514             reader.forward();
1515         } else {
1516             // Any other character implies c-ns-shorthand-tag type.
1517 
1518             // Look ahead in the stream to determine whether this tag property
1519             // is of the form !foo or !foo!bar.
1520             int length = 1;
1521             boolean useHandle = false;
1522             while (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1523                 if (ch == '!') {
1524                     useHandle = true;
1525                     break;
1526                 }
1527                 length++;
1528                 ch = reader.peek(length);
1529             }
1530             handle = "!";
1531             // If we need to use a handle, scan it in; otherwise, the handle is
1532             // presumed to be '!'.
1533             if (useHandle) {
1534                 handle = scanTagHandle("tag", startMark);
1535             } else {
1536                 handle = "!";
1537                 reader.forward();
1538             }
1539             suffix = scanTagUri("tag", startMark);
1540         }
1541         ch = reader.peek();
1542         // Check that the next character is allowed to follow a tag-property;
1543         // if it is not, raise the error.
1544         if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1545             throw new ScannerException("while scanning a tag", startMark,
1546                     "expected ' ', but found '" + ch + "' (" + ((int) ch) + ")", reader.getMark());
1547         }
1548         TagTuple value = new TagTuple(handle, suffix);
1549         Mark endMark = reader.getMark();
1550         return new TagToken(value, startMark, endMark);
1551     }
1552 
1553     private Token scanBlockScalar(char style) {
1554         // See the specification for details.
1555         boolean folded;
1556         // Depending on the given style, we determine whether the scalar is
1557         // folded ('>') or literal ('|')
1558         if (style == '>') {
1559             folded = true;
1560         } else {
1561             folded = false;
1562         }
1563         StringBuilder chunks = new StringBuilder();
1564         Mark startMark = reader.getMark();
1565         // Scan the header.
1566         reader.forward();
1567         Chomping chompi = scanBlockScalarIndicators(startMark);
1568         int increment = chompi.getIncrement();
1569         scanBlockScalarIgnoredLine(startMark);
1570 
1571         // Determine the indentation level and go to the first non-empty line.
1572         int minIndent = this.indent + 1;
1573         if (minIndent < 1) {
1574             minIndent = 1;
1575         }
1576         String breaks = null;
1577         int maxIndent = 0;
1578         int indent = 0;
1579         Mark endMark;
1580         if (increment == -1) {
1581             Object[] brme = scanBlockScalarIndentation();
1582             breaks = (String) brme[0];
1583             maxIndent = ((Integer) brme[1]).intValue();
1584             endMark = (Mark) brme[2];
1585             indent = Math.max(minIndent, maxIndent);
1586         } else {
1587             indent = minIndent + increment - 1;
1588             Object[] brme = scanBlockScalarBreaks(indent);
1589             breaks = (String) brme[0];
1590             endMark = (Mark) brme[1];
1591         }
1592 
1593         String lineBreak = "";
1594 
1595         // Scan the inner part of the block scalar.
1596         while (this.reader.getColumn() == indent && reader.peek() != '\0') {
1597             chunks.append(breaks);
1598             boolean leadingNonSpace = " \t".indexOf(reader.peek()) == -1;
1599             int length = 0;
1600             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(length))) {
1601                 length++;
1602             }
1603             chunks.append(reader.prefixForward(length));
1604             lineBreak = scanLineBreak();
1605             Object[] brme = scanBlockScalarBreaks(indent);
1606             breaks = (String) brme[0];
1607             endMark = (Mark) brme[1];
1608             if (this.reader.getColumn() == indent && reader.peek() != '\0') {
1609 
1610                 // Unfortunately, folding rules are ambiguous.
1611                 //
1612                 // This is the folding according to the specification:
1613                 if (folded && "\n".equals(lineBreak) && leadingNonSpace
1614                         && " \t".indexOf(reader.peek()) == -1) {
1615                     if (breaks.length() == 0) {
1616                         chunks.append(" ");
1617                     }
1618                 } else {
1619                     chunks.append(lineBreak);
1620                 }
1621                 // Clark Evans's interpretation (also in the spec examples) not
1622                 // imported from PyYAML
1623             } else {
1624                 break;
1625             }
1626         }
1627         // Chomp the tail.
1628         if (chompi.chompTailIsNotFalse()) {
1629             chunks.append(lineBreak);
1630         }
1631         if (chompi.chompTailIsTrue()) {
1632             chunks.append(breaks);
1633         }
1634         // We are done.
1635         return new ScalarToken(chunks.toString(), false, startMark, endMark, style);
1636     }
1637 
1638     /**
1639      * Scan a block scalar indicator. The block scalar indicator includes two
1640      * optional components, which may appear in either order.
1641      * 
1642      * A block indentation indicator is a non-zero digit describing the
1643      * indentation level of the block scalar to follow. This indentation is an
1644      * additional number of spaces relative to the current indentation level.
1645      * 
1646      * A block chomping indicator is a + or -, selecting the chomping mode away
1647      * from the default (clip) to either -(strip) or +(keep).
1648      * 
1649      * @see http://www.yaml.org/spec/1.1/#id868988
1650      * @see http://www.yaml.org/spec/1.1/#id927035
1651      * @see http://www.yaml.org/spec/1.1/#id927557
1652      */
1653     private Chomping scanBlockScalarIndicators(Mark startMark) {
1654         // See the specification for details.
1655         Boolean chomping = null;
1656         int increment = -1;
1657         char ch = reader.peek();
1658         if (ch == '-' || ch == '+') {
1659             if (ch == '+') {
1660                 chomping = Boolean.TRUE;
1661             } else {
1662                 chomping = Boolean.FALSE;
1663             }
1664             reader.forward();
1665             ch = reader.peek();
1666             if (Character.isDigit(ch)) {
1667                 increment = Integer.parseInt(String.valueOf(ch));
1668                 if (increment == 0) {
1669                     throw new ScannerException("while scanning a block scalar", startMark,
1670                             "expected indentation indicator in the range 1-9, but found 0",
1671                             reader.getMark());
1672                 }
1673                 reader.forward();
1674             }
1675         } else if (Character.isDigit(ch)) {
1676             increment = Integer.parseInt(String.valueOf(ch));
1677             if (increment == 0) {
1678                 throw new ScannerException("while scanning a block scalar", startMark,
1679                         "expected indentation indicator in the range 1-9, but found 0",
1680                         reader.getMark());
1681             }
1682             reader.forward();
1683             ch = reader.peek();
1684             if (ch == '-' || ch == '+') {
1685                 if (ch == '+') {
1686                     chomping = Boolean.TRUE;
1687                 } else {
1688                     chomping = Boolean.FALSE;
1689                 }
1690                 reader.forward();
1691             }
1692         }
1693         ch = reader.peek();
1694         if (Constant.NULL_BL_LINEBR.hasNo(ch)) {
1695             throw new ScannerException("while scanning a block scalar", startMark,
1696                     "expected chomping or indentation indicators, but found " + ch,
1697                     reader.getMark());
1698         }
1699         return new Chomping(chomping, increment);
1700     }
1701 
1702     /**
1703      * Scan to the end of the line after a block scalar has been scanned; the
1704      * only things that are permitted at this time are comments and spaces.
1705      */
1706     private String scanBlockScalarIgnoredLine(Mark startMark) {
1707         // See the specification for details.
1708         int ff = 0;
1709         // Forward past any number of trailing spaces
1710         while (reader.peek(ff) == ' ') {
1711             ff++;
1712         }
1713         if (ff > 0) {
1714             reader.forward(ff);
1715         }
1716         // If a comment occurs, scan to just before the end of line.
1717         if (reader.peek() == '#') {
1718             ff = 0;
1719             while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(ff))) {
1720                 ff++;
1721             }
1722             if (ff > 0) {
1723                 reader.forward(ff);
1724             }
1725         }
1726         // If the next character is not a null or line break, an error has
1727         // occurred.
1728         char ch = reader.peek();
1729         String lineBreak = scanLineBreak();
1730         if (lineBreak.length() == 0 && ch != '\0') {
1731             throw new ScannerException("while scanning a block scalar", startMark,
1732                     "expected a comment or a line break, but found " + ch, reader.getMark());
1733         }
1734         return lineBreak;
1735     }
1736 
1737     /**
1738      * Scans for the indentation of a block scalar implicitly. This mechanism is
1739      * used only if the block did not explicitly state an indentation to be
1740      * used.
1741      * 
1742      * @see http://www.yaml.org/spec/1.1/#id927035
1743      */
1744     private Object[] scanBlockScalarIndentation() {
1745         // See the specification for details.
1746         StringBuilder chunks = new StringBuilder();
1747         int maxIndent = 0;
1748         Mark endMark = reader.getMark();
1749         // Look ahead some number of lines until the first non-blank character
1750         // occurs; the determined indentation will be the maximum number of
1751         // leading spaces on any of these lines.
1752         while (Constant.LINEBR.has(reader.peek(), " \r")) {
1753             if (reader.peek() != ' ') {
1754                 // If the character isn't a space, it must be some kind of
1755                 // line-break; scan the line break and track it.
1756                 chunks.append(scanLineBreak());
1757                 endMark = reader.getMark();
1758             } else {
1759                 // If the character is a space, move forward to the next
1760                 // character; if we surpass our previous maximum for indent
1761                 // level, update that too.
1762                 reader.forward();
1763                 if (this.reader.getColumn() > maxIndent) {
1764                     maxIndent = reader.getColumn();
1765                 }
1766             }
1767         }
1768         // Pass several results back together.
1769         return new Object[] { chunks.toString(), maxIndent, endMark };
1770     }
1771 
1772     private Object[] scanBlockScalarBreaks(int indent) {
1773         // See the specification for details.
1774         StringBuilder chunks = new StringBuilder();
1775         Mark endMark = reader.getMark();
1776         int ff = 0;
1777         int col = this.reader.getColumn();
1778         // Scan for up to the expected indentation-level of spaces, then move
1779         // forward past that amount.
1780         while (col < indent && reader.peek(ff) == ' ') {
1781             ff++;
1782             col++;
1783         }
1784         if (ff > 0) {
1785             reader.forward(ff);
1786         }
1787         // Consume one or more line breaks followed by any amount of spaces,
1788         // until we find something that isn't a line-break.
1789         String lineBreak = null;
1790         while ((lineBreak = scanLineBreak()).length() != 0) {
1791             chunks.append(lineBreak);
1792             endMark = reader.getMark();
1793             // Scan past up to (indent) spaces on the next line, then forward
1794             // past them.
1795             ff = 0;
1796             col = this.reader.getColumn();
1797             while (col < indent && reader.peek(ff) == ' ') {
1798                 ff++;
1799                 col++;
1800             }
1801             if (ff > 0) {
1802                 reader.forward(ff);
1803             }
1804         }
1805         // Return both the assembled intervening string and the end-mark.
1806         return new Object[] { chunks.toString(), endMark };
1807     }
1808 
1809     /**
1810      * Scan a flow-style scalar. Flow scalars are presented in one of two forms;
1811      * first, a flow scalar may be a double-quoted string; second, a flow scalar
1812      * may be a single-quoted string.
1813      * 
1814      * @see http://www.yaml.org/spec/1.1/#flow style/syntax
1815      * 
1816      *      <pre>
1817      * See the specification for details.
1818      * Note that we loosen indentation rules for quoted scalars. Quoted
1819      * scalars don't need to adhere to indentation because &quot; and ' clearly
1820      * mark the beginning and the end of them. Therefore we are less
1821      * restrictive than the specification requires. We only need to check
1822      * that document separators are not included in scalars.
1823      * </pre>
1824      */
1825     private Token scanFlowScalar(char style) {
1826         boolean _double;
1827         // The style will be either single- or double-quoted; we determine this
1828         // by the first character in the entry (supplied)
1829         if (style == '"') {
1830             _double = true;
1831         } else {
1832             _double = false;
1833         }
1834         StringBuilder chunks = new StringBuilder();
1835         Mark startMark = reader.getMark();
1836         char quote = reader.peek();
1837         reader.forward();
1838         chunks.append(scanFlowScalarNonSpaces(_double, startMark));
1839         while (reader.peek() != quote) {
1840             chunks.append(scanFlowScalarSpaces(startMark));
1841             chunks.append(scanFlowScalarNonSpaces(_double, startMark));
1842         }
1843         reader.forward();
1844         Mark endMark = reader.getMark();
1845         return new ScalarToken(chunks.toString(), false, startMark, endMark, style);
1846     }
1847 
1848     /**
1849      * Scan some number of flow-scalar non-space characters.
1850      */
1851     private String scanFlowScalarNonSpaces(boolean doubleQuoted, Mark startMark) {
1852         // See the specification for details.
1853         StringBuilder chunks = new StringBuilder();
1854         while (true) {
1855             // Scan through any number of characters which are not: NUL, blank,
1856             // tabs, line breaks, single-quotes, double-quotes, or backslashes.
1857             int length = 0;
1858             while (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length), "\'\"\\")) {
1859                 length++;
1860             }
1861             if (length != 0) {
1862                 chunks.append(reader.prefixForward(length));
1863             }
1864             // Depending on our quoting-type, the characters ', " and \ have
1865             // differing meanings.
1866             char ch = reader.peek();
1867             if (!doubleQuoted && ch == '\'' && reader.peek(1) == '\'') {
1868                 chunks.append("'");
1869                 reader.forward(2);
1870             } else if ((doubleQuoted && ch == '\'') || (!doubleQuoted && "\"\\".indexOf(ch) != -1)) {
1871                 chunks.append(ch);
1872                 reader.forward();
1873             } else if (doubleQuoted && ch == '\\') {
1874                 reader.forward();
1875                 ch = reader.peek();
1876                 if (ESCAPE_REPLACEMENTS.containsKey(Character.valueOf(ch))) {
1877                     // The character is one of the single-replacement
1878                     // types; these are replaced with a literal character
1879                     // from the mapping.
1880                     chunks.append(ESCAPE_REPLACEMENTS.get(Character.valueOf(ch)));
1881                     reader.forward();
1882                 } else if (ESCAPE_CODES.containsKey(Character.valueOf(ch))) {
1883                     // The character is a multi-digit escape sequence, with
1884                     // length defined by the value in the ESCAPE_CODES map.
1885                     length = (ESCAPE_CODES.get(Character.valueOf(ch))).intValue();
1886                     reader.forward();
1887                     String hex = reader.prefix(length);
1888                     if (NOT_HEXA.matcher(hex).find()) {
1889                         throw new ScannerException("while scanning a double-quoted scalar",
1890                                 startMark, "expected escape sequence of " + length
1891                                         + " hexadecimal numbers, but found: " + hex,
1892                                 reader.getMark());
1893                     }
1894                     int decimal = Integer.parseInt(hex, 16);
1895                     String unicode = new String(Character.toChars(decimal));
1896                     chunks.append(unicode);
1897                     reader.forward(length);
1898                 } else if (scanLineBreak().length() != 0) {
1899                     chunks.append(scanFlowScalarBreaks(startMark));
1900                 } else {
1901                     throw new ScannerException("while scanning a double-quoted scalar", startMark,
1902                             "found unknown escape character " + ch + "(" + ((int) ch) + ")",
1903                             reader.getMark());
1904                 }
1905             } else {
1906                 return chunks.toString();
1907             }
1908         }
1909     }
1910 
1911     private String scanFlowScalarSpaces(Mark startMark) {
1912         // See the specification for details.
1913         StringBuilder chunks = new StringBuilder();
1914         int length = 0;
1915         // Scan through any number of whitespace (space, tab) characters,
1916         // consuming them.
1917         while (" \t".indexOf(reader.peek(length)) != -1) {
1918             length++;
1919         }
1920         String whitespaces = reader.prefixForward(length);
1921         char ch = reader.peek();
1922         if (ch == '\0') {
1923             // A flow scalar cannot end with an end-of-stream
1924             throw new ScannerException("while scanning a quoted scalar", startMark,
1925                     "found unexpected end of stream", reader.getMark());
1926         }
1927         // If we encounter a line break, scan it into our assembled string...
1928         String lineBreak = scanLineBreak();
1929         if (lineBreak.length() != 0) {
1930             String breaks = scanFlowScalarBreaks(startMark);
1931             if (!"\n".equals(lineBreak)) {
1932                 chunks.append(lineBreak);
1933             } else if (breaks.length() == 0) {
1934                 chunks.append(" ");
1935             }
1936             chunks.append(breaks);
1937         } else {
1938             chunks.append(whitespaces);
1939         }
1940         return chunks.toString();
1941     }
1942 
1943     private String scanFlowScalarBreaks(Mark startMark) {
1944         // See the specification for details.
1945         StringBuilder chunks = new StringBuilder();
1946         while (true) {
1947             // Instead of checking indentation, we check for document
1948             // separators.
1949             String prefix = reader.prefix(3);
1950             if (("---".equals(prefix) || "...".equals(prefix))
1951                     && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
1952                 throw new ScannerException("while scanning a quoted scalar", startMark,
1953                         "found unexpected document separator", reader.getMark());
1954             }
1955             // Scan past any number of spaces and tabs, ignoring them
1956             while (" \t".indexOf(reader.peek()) != -1) {
1957                 reader.forward();
1958             }
1959             // If we stopped at a line break, add that; otherwise, return the
1960             // assembled set of scalar breaks.
1961             String lineBreak = scanLineBreak();
1962             if (lineBreak.length() != 0) {
1963                 chunks.append(lineBreak);
1964             } else {
1965                 return chunks.toString();
1966             }
1967         }
1968     }
1969 
1970     /**
1971      * Scan a plain scalar.
1972      * 
1973      * <pre>
1974      * See the specification for details.
1975      * We add an additional restriction for the flow context:
1976      *   plain scalars in the flow context cannot contain ',', ':' and '?'.
1977      * We also keep track of the `allow_simple_key` flag here.
1978      * Indentation rules are loosened for the flow context.
1979      * </pre>
1980      */
1981     private Token scanPlain() {
1982         StringBuilder chunks = new StringBuilder();
1983         Mark startMark = reader.getMark();
1984         Mark endMark = startMark;
1985         int indent = this.indent + 1;
1986         String spaces = "";
1987         while (true) {
1988             char ch;
1989             int length = 0;
1990             // A comment indicates the end of the scalar.
1991             if (reader.peek() == '#') {
1992                 break;
1993             }
1994             while (true) {
1995                 ch = reader.peek(length);
1996                 if (Constant.NULL_BL_T_LINEBR.has(ch)
1997                         || (this.flowLevel == 0 && ch == ':' && Constant.NULL_BL_T_LINEBR
1998                                 .has(reader.peek(length + 1)))
1999                         || (this.flowLevel != 0 && ",:?[]{}".indexOf(ch) != -1)) {
2000                     break;
2001                 }
2002                 length++;
2003             }
2004             // It's not clear what we should do with ':' in the flow context.
2005             if (this.flowLevel != 0 && ch == ':'
2006                     && Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length + 1), ",[]{}")) {
2007                 reader.forward(length);
2008                 throw new ScannerException("while scanning a plain scalar", startMark,
2009                         "found unexpected ':'", reader.getMark(),
2010                         "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.");
2011             }
2012             if (length == 0) {
2013                 break;
2014             }
2015             this.allowSimpleKey = false;
2016             chunks.append(spaces);
2017             chunks.append(reader.prefixForward(length));
2018             endMark = reader.getMark();
2019             spaces = scanPlainSpaces();
2020             // System.out.printf("spaces[%s]\n", spaces);
2021             if (spaces.length() == 0 || reader.peek() == '#'
2022                     || (this.flowLevel == 0 && this.reader.getColumn() < indent)) {
2023                 break;
2024             }
2025         }
2026         return new ScalarToken(chunks.toString(), startMark, endMark, true);
2027     }
2028 
2029     /**
2030      * See the specification for details. SnakeYAML and libyaml allow tabs
2031      * inside plain scalar
2032      */
2033     private String scanPlainSpaces() {
2034         int length = 0;
2035         while (reader.peek(length) == ' ' || reader.peek(length) == '\t') {
2036             length++;
2037         }
2038         String whitespaces = reader.prefixForward(length);
2039         String lineBreak = scanLineBreak();
2040         if (lineBreak.length() != 0) {
2041             this.allowSimpleKey = true;
2042             String prefix = reader.prefix(3);
2043             if ("---".equals(prefix) || "...".equals(prefix)
2044                     && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
2045                 return "";
2046             }
2047             StringBuilder breaks = new StringBuilder();
2048             while (true) {
2049                 if (reader.peek() == ' ') {
2050                     reader.forward();
2051                 } else {
2052                     String lb = scanLineBreak();
2053                     if (lb.length() != 0) {
2054                         breaks.append(lb);
2055                         prefix = reader.prefix(3);
2056                         if ("---".equals(prefix) || "...".equals(prefix)
2057                                 && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
2058                             return "";
2059                         }
2060                     } else {
2061                         break;
2062                     }
2063                 }
2064             }
2065             if (!"\n".equals(lineBreak)) {
2066                 return lineBreak + breaks;
2067             } else if (breaks.length() == 0) {
2068                 return " ";
2069             }
2070             return breaks.toString();
2071         }
2072         return whitespaces;
2073     }
2074 
2075     /**
2076      * <p>
2077      * Scan a Tag handle. A Tag handle takes one of three forms:
2078      * 
2079      * <pre>
2080      * "!" (c-primary-tag-handle)
2081      * "!!" (ns-secondary-tag-handle)
2082      * "!(name)!" (c-named-tag-handle)
2083      * </pre>
2084      * 
2085      * Where (name) must be formatted as an ns-word-char.
2086      * </p>
2087      * 
2088      * @see http://www.yaml.org/spec/1.1/#c-tag-handle
2089      * @see http://www.yaml.org/spec/1.1/#ns-word-char
2090      * 
2091      *      <pre>
2092      * See the specification for details.
2093      * For some strange reasons, the specification does not allow '_' in
2094      * tag handles. I have allowed it anyway.
2095      * </pre>
2096      */
2097     private String scanTagHandle(String name, Mark startMark) {
2098         char ch = reader.peek();
2099         if (ch != '!') {
2100             throw new ScannerException("while scanning a " + name, startMark,
2101                     "expected '!', but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
2102         }
2103         // Look for the next '!' in the stream, stopping if we hit a
2104         // non-word-character. If the first character is a space, then the
2105         // tag-handle is a c-primary-tag-handle ('!').
2106         int length = 1;
2107         ch = reader.peek(length);
2108         if (ch != ' ') {
2109             // Scan through 0+ alphabetic characters.
2110             // FIXME According to the specification, these should be
2111             // ns-word-char only, which prohibits '_'. This might be a
2112             // candidate for a configuration option.
2113             while (Constant.ALPHA.has(ch)) {
2114                 length++;
2115                 ch = reader.peek(length);
2116             }
2117             // Found the next non-word-char. If this is not a space and not an
2118             // '!', then this is an error, as the tag-handle was specified as:
2119             // !(name) or similar; the trailing '!' is missing.
2120             if (ch != '!') {
2121                 reader.forward(length);
2122                 throw new ScannerException("while scanning a " + name, startMark,
2123                         "expected '!', but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
2124             }
2125             length++;
2126         }
2127         String value = reader.prefixForward(length);
2128         return value;
2129     }
2130 
2131     /**
2132      * <p>
2133      * Scan a Tag URI. This scanning is valid for both local and global tag
2134      * directives, because both appear to be valid URIs as far as scanning is
2135      * concerned. The difference may be distinguished later, in parsing. This
2136      * method will scan for ns-uri-char*, which covers both cases.
2137      * </p>
2138      * 
2139      * <p>
2140      * This method performs no verification that the scanned URI conforms to any
2141      * particular kind of URI specification.
2142      * </p>
2143      * 
2144      * @see http://www.yaml.org/spec/1.1/#ns-uri-char
2145      */
2146     private String scanTagUri(String name, Mark startMark) {
2147         // See the specification for details.
2148         // Note: we do not check if URI is well-formed.
2149         StringBuilder chunks = new StringBuilder();
2150         // Scan through accepted URI characters, which includes the standard
2151         // URI characters, plus the start-escape character ('%'). When we get
2152         // to a start-escape, scan the escaped sequence, then return.
2153         int length = 0;
2154         char ch = reader.peek(length);
2155         while (Constant.URI_CHARS.has(ch)) {
2156             if (ch == '%') {
2157                 chunks.append(reader.prefixForward(length));
2158                 length = 0;
2159                 chunks.append(scanUriEscapes(name, startMark));
2160             } else {
2161                 length++;
2162             }
2163             ch = reader.peek(length);
2164         }
2165         // Consume the last "chunk", which would not otherwise be consumed by
2166         // the loop above.
2167         if (length != 0) {
2168             chunks.append(reader.prefixForward(length));
2169             length = 0;
2170         }
2171         if (chunks.length() == 0) {
2172             // If no URI was found, an error has occurred.
2173             throw new ScannerException("while scanning a " + name, startMark,
2174                     "expected URI, but found " + ch + "(" + ((int) ch) + ")", reader.getMark());
2175         }
2176         return chunks.toString();
2177     }
2178 
2179     /**
2180      * <p>
2181      * Scan a sequence of %-escaped URI escape codes and convert them into a
2182      * String representing the unescaped values.
2183      * </p>
2184      * 
2185      * FIXME This method fails for more than 256 bytes' worth of URI-encoded
2186      * characters in a row. Is this possible? Is this a use-case?
2187      * 
2188      * @see http://www.ietf.org/rfc/rfc2396.txt, section 2.4, Escaped Encoding.
2189      */
2190     private String scanUriEscapes(String name, Mark startMark) {
2191         // First, look ahead to see how many URI-escaped characters we should
2192         // expect, so we can use the correct buffer size.
2193         int length = 1;
2194         while (reader.peek(length * 3) == '%') {
2195             length++;
2196         }
2197         // See the specification for details.
2198         // URIs containing 16 and 32 bit Unicode characters are
2199         // encoded in UTF-8, and then each octet is written as a
2200         // separate character.
2201         Mark beginningMark = reader.getMark();
2202         ByteBuffer buff = ByteBuffer.allocate(length);
2203         while (reader.peek() == '%') {
2204             reader.forward();
2205             try {
2206                 byte code = (byte) Integer.parseInt(reader.prefix(2), 16);
2207                 buff.put(code);
2208             } catch (NumberFormatException nfe) {
2209                 throw new ScannerException("while scanning a " + name, startMark,
2210                         "expected URI escape sequence of 2 hexadecimal numbers, but found "
2211                                 + reader.peek() + "(" + ((int) reader.peek()) + ") and "
2212                                 + reader.peek(1) + "(" + ((int) reader.peek(1)) + ")",
2213                         reader.getMark());
2214             }
2215             reader.forward(2);
2216         }
2217         buff.flip();
2218         try {
2219             return UriEncoder.decode(buff);
2220         } catch (CharacterCodingException e) {
2221             throw new ScannerException("while scanning a " + name, startMark,
2222                     "expected URI in UTF-8: " + e.getMessage(), beginningMark);
2223         }
2224     }
2225 
2226     /**
2227      * Scan a line break, transforming:
2228      * 
2229      * <pre>
2230      * '\r\n' : '\n'
2231      * '\r' : '\n'
2232      * '\n' : '\n'
2233      * '\x85' : '\n'
2234      * default : ''
2235      * </pre>
2236      */
2237     private String scanLineBreak() {
2238         // Transforms:
2239         // '\r\n' : '\n'
2240         // '\r' : '\n'
2241         // '\n' : '\n'
2242         // '\x85' : '\n'
2243         // default : ''
2244         char ch = reader.peek();
2245         if (ch == '\r' || ch == '\n' || ch == '\u0085') {
2246             if (ch == '\r' && '\n' == reader.peek(1)) {
2247                 reader.forward(2);
2248             } else {
2249                 reader.forward();
2250             }
2251             return "\n";
2252         } else if (ch == '\u2028' || ch == '\u2029') {
2253             reader.forward();
2254             return String.valueOf(ch);
2255         }
2256         return "";
2257     }
2258 
2259     /**
2260      * Chomping the tail may have 3 values - yes, no, not defined.
2261      */
2262     private static class Chomping {
2263         private final Boolean value;
2264         private final int increment;
2265 
2266         public Chomping(Boolean value, int increment) {
2267             this.value = value;
2268             this.increment = increment;
2269         }
2270 
2271         public boolean chompTailIsNotFalse() {
2272             return value == null || value;
2273         }
2274 
2275         public boolean chompTailIsTrue() {
2276             return value != null && value;
2277         }
2278 
2279         public int getIncrement() {
2280             return increment;
2281         }
2282     }
2283 }