Source for gnu.xml.pipeline.ValidationConsumer

   1: /* ValidationConsumer.java -- 
   2:    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: package gnu.xml.pipeline;
  39: 
  40: import java.io.IOException;
  41: import java.io.StringReader;
  42: import java.io.StringWriter;
  43: import java.util.EmptyStackException;
  44: import java.util.Enumeration;
  45: import java.util.Hashtable;
  46: import java.util.Stack;
  47: import java.util.StringTokenizer;
  48: import java.util.Vector;
  49: 
  50: import org.xml.sax.Attributes;
  51: import org.xml.sax.EntityResolver;
  52: import org.xml.sax.ErrorHandler;
  53: import org.xml.sax.InputSource;
  54: import org.xml.sax.Locator;
  55: import org.xml.sax.SAXException;
  56: import org.xml.sax.SAXParseException;
  57: import org.xml.sax.XMLReader;
  58: import org.xml.sax.helpers.XMLReaderFactory;
  59: 
  60: /**
  61:  * This class checks SAX2 events to report validity errors; it works as
  62:  * both a filter and a terminus on an event pipeline.  It relies on the
  63:  * producer of SAX events to:  </p> <ol>
  64:  *
  65:  *    <li> Conform to the specification of a non-validating XML parser that
  66:  *    reads all external entities, reported using SAX2 events. </li>
  67:  *
  68:  *    <li> Report ignorable whitespace as such (through the ContentHandler
  69:  *    interface).  This is, strictly speaking, optional for nonvalidating
  70:  *    XML processors.  </li>
  71:  *
  72:  *    <li> Make SAX2 DeclHandler callbacks, with default
  73:  *    attribute values already normalized (and without "&lt;").</li>
  74:  *
  75:  *    <li> Make SAX2 LexicalHandler startDTD() and endDTD ()
  76:  *    callbacks. </li>
  77:  *
  78:  *    <li> Act as if the <em>(URI)/namespace-prefixes</em> property were
  79:  *    set to true, by providing XML 1.0 names and all <code>xmlns*</code>
  80:  *    attributes (rather than omitting either or both). </li>
  81:  *
  82:  *    </ol>
  83:  *
  84:  * <p> At this writing, the major SAX2 parsers (such as &AElig;lfred2,
  85:  * Crimson, and Xerces) meet these requirements, and this validation
  86:  * module is used by the optional &AElig;lfred2 validation support.
  87:  * </p>
  88:  *
  89:  * <p> Note that because this is a layered validator, it has to duplicate some
  90:  * work that the parser is doing; there are also other cost to layering.
  91:  * However, <em>because of layering it doesn't need a parser</em> in order
  92:  * to work! You can use it with anything that generates SAX events, such
  93:  * as an application component that wants to detect invalid content in
  94:  * a changed area without validating an entire document, or which wants to
  95:  * ensure that it doesn't write invalid data to a communications partner.</p>
  96:  *
  97:  * <p> Also, note that because this is a layered validator, the line numbers
  98:  * reported for some errors may seem strange.  For example, if an element does
  99:  * not permit character content, the validator
 100:  * will use the locator provided to it.
 101:  * That might reflect the last character of a <em>characters</em> event
 102:  * callback, rather than the first non-whitespace character. </p>
 103:  *
 104:  * <hr />
 105:  *
 106:  * <!--
 107:  * <p> Of interest is the fact that unlike most currently known XML validators,
 108:  * this one can report some cases of non-determinism in element content models.
 109:  * It is a compile-time option, enabled by default.  This will only report
 110:  * such XML errors if they relate to content actually appearing in a document;
 111:  * content models aren't aggressively scanned for non-deterministic structure.
 112:  * Documents which trigger such non-deterministic transitions may be handled
 113:  * differently by different validating parsers, without losing conformance
 114:  * to the XML specification. </p>
 115:  * -->
 116:  *
 117:  * <p> Current limitations of the validation performed are in roughly three
 118:  * categories.  </p>
 119:  *
 120:  * <p> The first category represents constraints which demand violations
 121:  * of software layering:  exposing lexical details, one of the first things
 122:  * that <em>application</em> programming interfaces (APIs) hide.  These
 123:  * invariably relate to XML entity handling, and to historical oddities
 124:  * of the XML validation semantics.  Curiously,
 125:  * recent (Autumn 1999) conformance testing showed that these constraints are
 126:  * among those handled worst by existing XML validating parsers.  Arguments
 127:  * have been made that each of these VCs should be turned into WFCs (most
 128:  * of them) or discarded (popular for the standalone declaration); in short,
 129:  * that these are bugs in the XML specification (not all via SGML): </p><ul>
 130:  *
 131:  *    <li> The <em>Proper Declaration/PE Nesting</em> and
 132:  *    <em>Proper Group/PE Nesting</em> VCs can't be tested because they
 133:  *    require access to particularly low level lexical level information.
 134:  *    In essence, the reason XML isn't a simple thing to parse is that
 135:  *    it's not a context free grammar, and these constraints elevate that
 136:  *    SGML-derived context sensitivity to the level of a semantic rule.
 137:  *
 138:  *    <li> The <em>Standalone Document Declaration</em> VC can't be
 139:  *    tested.  This is for two reasons.  First, this flag isn't made
 140:  *    available through SAX2.  Second, it also requires breaking that
 141:  *    lexical layering boundary.  (If you ever wondered why classes
 142:  *    in compiler construction or language design barely mention the
 143:  *    existence of context-sensitive grammars, it's because of messy
 144:  *    issues like these.)
 145:  *
 146:  *    <li> The <em>Entity Declared</em> VC can't be tested, because it
 147:  *    also requires breaking that lexical layering boundary!  There's also
 148:  *    another issue: the VC wording (and seemingly intent) is ambiguous.
 149:  *    (This is still true in the "Second edition" XML spec.)
 150:  *    Since there is a WFC of the same name, everyone's life would be
 151:  *    easier if references to undeclared parsed entities were always well
 152:  *    formedness errors, regardless of whether they're parameter entities
 153:  *    or not.  (Note that nonvalidating parsers are not required
 154:  *    to report all such well formedness errors if they don't read external
 155:  *    parameter entities, although currently most XML parsers read them
 156:  *    in an attempt to avoid problems from inconsistent parser behavior.)
 157:  *
 158:  *    </ul>
 159:  *
 160:  * <p> The second category of limitations on this validation represent
 161:  * constraints associated with information that is not guaranteed to be
 162:  * available (or in one case, <em>is guaranteed not to be available</em>,
 163:  * through the SAX2 API: </p><ul>
 164:  *
 165:  *    <li> The <em>Unique Element Type Declaration</em> VC may not be
 166:  *    reportable, if the underlying parser happens not to expose
 167:  *    multiple declarations.   (&AElig;lfred2 reports these validity
 168:  *    errors directly.)</li>
 169:  *
 170:  *    <li> Similarly, the <em>Unique Notation Name</em> VC, added in the
 171:  *    14-January-2000 XML spec errata to restrict typing models used by
 172:  *    elements, may not be reportable.  (&AElig;lfred reports these
 173:  *    validity errors directly.) </li>
 174:  *
 175:  *    </ul>
 176:  *
 177:  * <p> A third category relates to ease of implementation.  (Think of this
 178:  * as "bugs".)  The most notable issue here is character handling.  Rather
 179:  * than attempting to implement the voluminous character tables in the XML
 180:  * specification (Appendix B), Unicode rules are used directly from
 181:  * the java.lang.Character class.  Recent JVMs have begun to diverge from
 182:  * the original specification for that class (Unicode 2.0), meaning that
 183:  * different JVMs may handle that aspect of conformance differently.
 184:  * </p>
 185:  *
 186:  * <p> Note that for some of the validity errors that SAX2 does not
 187:  * expose, a nonvalidating parser is permitted (by the XML specification)
 188:  * to report validity errors.  When used with a parser that does so for
 189:  * the validity constraints mentioned above (or any other SAX2 event
 190:  * stream producer that does the same thing), overall conformance is
 191:  * substantially improved.
 192:  *
 193:  * @see gnu.xml.aelfred2.SAXDriver
 194:  * @see gnu.xml.aelfred2.XmlReader
 195:  *
 196:  * @author David Brownell
 197:  */
 198: public final class ValidationConsumer extends EventFilter
 199: {
 200:     // report error if we happen to notice a non-deterministic choice?
 201:     // we won't report buggy content models; just buggy instances
 202:     private static final boolean    warnNonDeterministic = false;
 203: 
 204:     // for tracking active content models
 205:     private String        rootName;
 206:     private Stack        contentStack = new Stack ();
 207: 
 208:     // flags for "saved DTD" processing
 209:     private boolean        disableDeclarations;
 210:     private boolean        disableReset;
 211: 
 212:     //
 213:     // most VCs get tested when we see element start tags.  the per-element
 214:     // info (including attributes) recorded here duplicates that found inside
 215:     // many nonvalidating parsers, hence dual lookups etc ... that's why a
 216:     // layered validator isn't going to be as fast as a non-layered one.
 217:     //
 218: 
 219:     // key = element name; value = ElementInfo
 220:     private Hashtable        elements = new Hashtable ();
 221: 
 222:     // some VCs relate to ID/IDREF/IDREFS attributes
 223:     // key = id; value = boolean true (defd) or false (refd)
 224:     private Hashtable        ids = new Hashtable ();
 225: 
 226:     // we just record declared notation and unparsed entity names.
 227:     // the implementation here is simple/slow; these features
 228:     // are seldom used, one hopes they'll wither away soon
 229:     private Vector        notations = new Vector (5, 5);
 230:     private Vector        nDeferred = new Vector (5, 5);
 231:     private Vector        unparsed = new Vector (5, 5);
 232:     private Vector        uDeferred = new Vector (5, 5);
 233:     
 234:     // note: DocBk 3.1.7 XML defines over 2 dozen notations,
 235:     // used when defining unparsed entities for graphics
 236:     // (and maybe in other places)
 237: 
 238:     
 239: 
 240:     /**
 241:      * Creates a pipeline terminus which consumes all events passed to
 242:      * it; this will report validity errors as if they were fatal errors,
 243:      * unless an error handler is assigned.
 244:      *
 245:      * @see #setErrorHandler
 246:      */
 247:     // constructor used by PipelineFactory
 248:         // ... and want one taking system ID of an external subset
 249:     public ValidationConsumer ()
 250:     {
 251:     this (null);
 252:     }
 253: 
 254:     /**
 255:      * Creates a pipeline filter which reports validity errors and then
 256:      * passes events on to the next consumer if they were not fatal.
 257:      *
 258:      * @see #setErrorHandler
 259:      */
 260:     // constructor used by PipelineFactory
 261:         // ... and want one taking system ID of an external subset
 262:         // (which won't send declaration events)
 263:     public ValidationConsumer (EventConsumer next)
 264:     {
 265:     super (next);
 266: 
 267:     setContentHandler (this);
 268:     setDTDHandler (this);
 269:     try { setProperty (DECL_HANDLER, this); }
 270:     catch (Exception e) { /* "can't happen" */ }
 271:     try { setProperty (LEXICAL_HANDLER, this); }
 272:     catch (Exception e) { /* "can't happen" */ }
 273:     }
 274: 
 275:     
 276:     private static final String    fakeRootName
 277:     = ":Nobody:in:their_Right.Mind_would:use:this-name:1x:";
 278:     
 279:     /**
 280:      * Creates a validation consumer which is preloaded with the DTD provided.
 281:      * It does this by constructing a document with that DTD, then parsing
 282:      * that document and recording its DTD declarations.  Then it arranges
 283:      * not to modify that information.
 284:      *
 285:      * <p> The resulting validation consumer will only validate against
 286:      * the specified DTD, regardless of whether some other DTD is found
 287:      * in a document being parsed.
 288:      *
 289:      * @param rootName The name of the required root element; if this is
 290:      *    null, any root element name will be accepted.
 291:      * @param publicId If non-null and there is a non-null systemId, this
 292:      *    identifier provides an alternate access identifier for the DTD's
 293:      *    external subset.
 294:      * @param systemId If non-null, this is a URI (normally URL) that
 295:      *    may be used to access the DTD's external subset.
 296:      * @param internalSubset If non-null, holds literal markup declarations
 297:      *    comprising the DTD's internal subset.
 298:      * @param resolver If non-null, this will be provided to the parser for
 299:      *    use when resolving parameter entities (including any external subset).
 300:      * @param resolver If non-null, this will be provided to the parser for
 301:      *    use when resolving parameter entities (including any external subset).
 302:      * @param minimalElement If non-null, a minimal valid document.
 303:      *
 304:      * @exception SAXNotSupportedException If the default SAX parser does
 305:      *    not support the standard lexical or declaration handlers.
 306:      * @exception SAXParseException If the specified DTD has either
 307:      *    well-formedness or validity errors
 308:      * @exception IOException If the specified DTD can't be read for
 309:      *    some reason
 310:      */
 311:     public ValidationConsumer (
 312:     String        rootName,
 313:     String        publicId,
 314:     String        systemId,
 315:     String        internalSubset,
 316:     EntityResolver    resolver,
 317:     String        minimalDocument
 318:     ) throws SAXException, IOException
 319:     {
 320:     this (null);
 321: 
 322:     disableReset = true;
 323:     if (rootName == null)
 324:         rootName = fakeRootName;
 325: 
 326:     //
 327:     // Synthesize document with that DTD; is it possible to do
 328:     // better for the declaration of the root element?
 329:     //
 330:     // NOTE:  can't use SAX2 to write internal subsets.
 331:     //
 332:     StringWriter    writer = new StringWriter ();
 333: 
 334:     writer.write ("<!DOCTYPE ");
 335:     writer.write (rootName);
 336:     if (systemId != null) {
 337:         writer.write ("\n  ");
 338:         if (publicId != null) {
 339:         writer.write ("PUBLIC '");
 340:         writer.write (publicId);
 341:         writer.write ("'\n\t'");
 342:         } else
 343:         writer.write ("SYSTEM '");
 344:         writer.write (systemId);
 345:         writer.write ("'");
 346:     }
 347:     writer.write (" [ ");
 348:     if (rootName == fakeRootName) {
 349:         writer.write ("\n<!ELEMENT ");
 350:         writer.write (rootName);
 351:         writer.write (" EMPTY>");
 352:     }
 353:     if (internalSubset != null)
 354:         writer.write (internalSubset);
 355:     writer.write ("\n ]>");
 356: 
 357:     if (minimalDocument != null) {
 358:         writer.write ("\n");
 359:         writer.write (minimalDocument);
 360:         writer.write ("\n");
 361:     } else {
 362:         writer.write (" <");
 363:         writer.write (rootName);
 364:         writer.write ("/>\n");
 365:     }
 366:     minimalDocument = writer.toString ();
 367: 
 368:     //
 369:     // OK, load it
 370:     //
 371:     XMLReader    producer;
 372: 
 373:     producer = XMLReaderFactory.createXMLReader ();
 374:     bind (producer, this);
 375: 
 376:     if (resolver != null)
 377:         producer.setEntityResolver (resolver);
 378: 
 379:     InputSource    in;
 380:     
 381:     in = new InputSource (new StringReader (minimalDocument));
 382:     producer.parse (in);
 383: 
 384:     disableDeclarations = true;
 385:     if (rootName == fakeRootName)
 386:         this.rootName = null;
 387:     }
 388: 
 389:     private void resetState ()
 390:     {
 391:     if (!disableReset) {
 392:         rootName = null;
 393:         contentStack.removeAllElements ();
 394:         elements.clear ();
 395:         ids.clear ();
 396: 
 397:         notations.removeAllElements ();
 398:         nDeferred.removeAllElements ();
 399:         unparsed.removeAllElements ();
 400:         uDeferred.removeAllElements ();
 401:     }
 402:     }
 403: 
 404: 
 405:     private void warning (String description)
 406:     throws SAXException
 407:     {
 408:     ErrorHandler        errHandler = getErrorHandler ();
 409:     Locator            locator = getDocumentLocator ();
 410:     SAXParseException    err;
 411: 
 412:     if (errHandler == null)
 413:         return;
 414: 
 415:     if (locator == null)
 416:         err = new SAXParseException (description, null, null, -1, -1);
 417:     else
 418:         err = new SAXParseException (description, locator);
 419:     errHandler.warning (err);
 420:     }
 421: 
 422:     // package private (for ChildrenRecognizer)
 423:     private void error (String description)
 424:     throws SAXException
 425:     {
 426:     ErrorHandler        errHandler = getErrorHandler ();
 427:     Locator            locator = getDocumentLocator ();
 428:     SAXParseException    err;
 429: 
 430:     if (locator == null)
 431:         err = new SAXParseException (description, null, null, -1, -1);
 432:     else
 433:         err = new SAXParseException (description, locator);
 434:     if (errHandler != null)
 435:         errHandler.error (err);
 436:     else    // else we always treat it as fatal!
 437:         throw err;
 438:     }
 439: 
 440:     private void fatalError (String description)
 441:     throws SAXException
 442:     {
 443:     ErrorHandler        errHandler = getErrorHandler ();
 444:     Locator            locator = getDocumentLocator ();
 445:     SAXParseException    err;
 446: 
 447:     if (locator != null)
 448:         err = new SAXParseException (description, locator);
 449:     else
 450:         err = new SAXParseException (description, null, null, -1, -1);
 451:     if (errHandler != null)
 452:         errHandler.fatalError (err);
 453:     // we always treat this as fatal, regardless of the handler
 454:     throw err;
 455:     }
 456: 
 457: 
 458:     private static boolean isExtender (char c)
 459:     {
 460:     // [88] Extender ::= ...
 461:     return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
 462:            || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
 463:            || (c >= 0x3031 && c <= 0x3035)
 464:            || (c >= 0x309d && c <= 0x309e)
 465:            || (c >= 0x30fc && c <= 0x30fe);
 466:     }
 467: 
 468: 
 469:     // use augmented Unicode rules, not full XML rules
 470:     private boolean isName (String name, String context, String id)
 471:     throws SAXException
 472:     {
 473:     char    buf [] = name.toCharArray ();
 474:     boolean    pass = true;
 475: 
 476:     if (!Character.isUnicodeIdentifierStart (buf [0])
 477:         && ":_".indexOf (buf [0]) == -1)
 478:         pass = false;
 479:     else {
 480:         int max = buf.length;
 481:         for (int i = 1; pass && i < max; i++) {
 482:         char c = buf [i];
 483:         if (!Character.isUnicodeIdentifierPart (c)
 484:             && ":-_.".indexOf (c) == -1
 485:             && !isExtender (c))
 486:             pass = false;
 487:         }
 488:     }
 489: 
 490:     if (!pass)
 491:         error ("In " + context + " for " + id
 492:         + ", '" + name + "' is not a name");
 493:     return pass;    // true == OK
 494:     }
 495: 
 496:     // use augmented Unicode rules, not full XML rules
 497:     private boolean isNmtoken (String nmtoken, String context, String id)
 498:     throws SAXException
 499:     {
 500:     char    buf [] = nmtoken.toCharArray ();
 501:     boolean    pass = true;
 502:     int    max = buf.length;
 503: 
 504:     // XXX make this share code with isName
 505: 
 506:     for (int i = 0; pass && i < max; i++) {
 507:         char c = buf [i];
 508:         if (!Character.isUnicodeIdentifierPart (c)
 509:             && ":-_.".indexOf (c) == -1
 510:             && !isExtender (c))
 511:         pass = false;
 512:     }
 513: 
 514:     if (!pass)
 515:         error ("In " + context + " for " + id
 516:         + ", '" + nmtoken + "' is not a name token");
 517:     return pass;    // true == OK
 518:     }
 519: 
 520:     private void checkEnumeration (String value, String type, String name)
 521:     throws SAXException
 522:     {
 523:     if (!hasMatch (value, type))
 524:         // VC: Enumeration
 525:         error ("Value '" + value
 526:         + "' for attribute '" + name
 527:         + "' is not permitted: " + type);
 528:     }
 529: 
 530:     // used to test enumerated attributes and mixed content models
 531:     // package private
 532:     static boolean hasMatch (String value, String orList)
 533:     {
 534:     int len = value.length ();
 535:     int max = orList.length () - len;
 536: 
 537:     for (int start = 0;
 538:         (start = orList.indexOf (value, start)) != -1;
 539:         start++) {
 540:         char c;
 541: 
 542:         if (start > max)
 543:         break;
 544:         c = orList.charAt (start - 1);
 545:         if (c != '|' && c != '('/*)*/)
 546:         continue;
 547:         c = orList.charAt (start + len);
 548:         if (c != '|' && /*(*/ c != ')')
 549:         continue;
 550:         return true;
 551:     }
 552:     return false;
 553:     }
 554: 
 555:     /**
 556:      * <b>LexicalHandler</b> Records the declaration of the root
 557:      * element, so it can be verified later.
 558:      * Passed to the next consumer, unless this one was
 559:      * preloaded with a particular DTD.
 560:      */
 561:     public void startDTD (String name, String publicId, String systemId)
 562:     throws SAXException
 563:     {
 564:     if (disableDeclarations)
 565:         return;
 566: 
 567:     rootName = name;
 568:     super.startDTD (name, publicId, systemId);
 569:     }
 570: 
 571:     /**
 572:      * <b>LexicalHandler</b> Verifies that all referenced notations
 573:      * and unparsed entities have been declared.
 574:      * Passed to the next consumer, unless this one was
 575:      * preloaded with a particular DTD.
 576:      */
 577:     public void endDTD ()
 578:     throws SAXException
 579:     {
 580:     if (disableDeclarations)
 581:         return;
 582: 
 583:     // this is a convenient hook for end-of-dtd checks, but we
 584:     // could also trigger it in the first startElement call.
 585:     // locator info is more appropriate here though.
 586: 
 587:     // VC: Notation Declared (NDATA can refer to them before decls,
 588:     //    as can NOTATION attribute enumerations and defaults)
 589:     int length = nDeferred.size ();
 590:     for (int i = 0; i < length; i++) {
 591:         String notation = (String) nDeferred.elementAt (i);
 592:         if (!notations.contains (notation)) {
 593:         error ("A declaration referred to notation '" + notation
 594:             + "' which was never declared");
 595:         }
 596:     }
 597:     nDeferred.removeAllElements ();
 598: 
 599:     // VC: Entity Name (attribute values can refer to them
 600:     //    before they're declared); VC Attribute Default Legal
 601:     length = uDeferred.size ();
 602:     for (int i = 0; i < length; i++) {
 603:         String entity = (String) uDeferred.elementAt (i);
 604:         if (!unparsed.contains (entity)) {
 605:         error ("An attribute default referred to entity '" + entity
 606:             + "' which was never declared");
 607:         }
 608:     }
 609:     uDeferred.removeAllElements ();
 610:     super.endDTD ();
 611:     }
 612: 
 613: 
 614:     // These are interned, so we can rely on "==" to find the type of
 615:     // all attributes except enumerations ...
 616:     // "(this|or|that|...)" and "NOTATION (this|or|that|...)"
 617:     static final String types [] = {
 618:     "CDATA",
 619:     "ID", "IDREF", "IDREFS",
 620:     "NMTOKEN", "NMTOKENS",
 621:     "ENTITY", "ENTITIES"
 622:     };
 623: 
 624: 
 625:     /**
 626:      * <b>DecllHandler</b> Records attribute declaration for later use
 627:      * in validating document content, and checks validity constraints
 628:      * that are applicable to attribute declarations.
 629:      * Passed to the next consumer, unless this one was
 630:      * preloaded with a particular DTD.
 631:      */
 632:     public void attributeDecl (
 633:     String eName,
 634:     String aName,
 635:     String type,
 636:     String mode,
 637:     String value
 638:     ) throws SAXException
 639:     {
 640:     if (disableDeclarations)
 641:         return;
 642: 
 643:     ElementInfo    info = (ElementInfo) elements.get (eName);
 644:     AttributeInfo    ainfo = new AttributeInfo ();
 645:     boolean        checkOne = false;
 646:     boolean        interned = false;
 647: 
 648:     // cheap interning of type names and #FIXED, #REQUIRED
 649:     // for faster startElement (we can use "==")
 650:     for (int i = 0; i < types.length; i++) {
 651:         if (types [i].equals (type)) {
 652:         type = types [i];
 653:         interned = true;
 654:         break;
 655:         }
 656:     }
 657:     if ("#FIXED".equals (mode))
 658:         mode = "#FIXED";
 659:     else if ("#REQUIRED".equals (mode))
 660:         mode = "#REQUIRED";
 661: 
 662:     ainfo.type = type;
 663:     ainfo.mode = mode;
 664:     ainfo.value = value;
 665: 
 666:     // we might not have seen the content model yet
 667:     if (info == null) {
 668:         info = new ElementInfo (eName);
 669:         elements.put (eName, info);
 670:     }
 671:     if ("ID" == type) {
 672:         checkOne = true;
 673:         if (!("#REQUIRED" == mode || "#IMPLIED".equals (mode))) {
 674:         // VC: ID Attribute Default
 675:         error ("ID attribute '" + aName
 676:             + "' must be #IMPLIED or #REQUIRED");
 677:         }
 678: 
 679:     } else if (!interned && type.startsWith ("NOTATION ")) {
 680:         checkOne = true;
 681: 
 682:         // VC: Notation Attributes (notations must be declared)
 683:         StringTokenizer    tokens = new StringTokenizer (
 684:         type.substring (10, type.lastIndexOf (')')),
 685:         "|");
 686:         while (tokens.hasMoreTokens ()) {
 687:         String    token = tokens.nextToken ();
 688:         if (!notations.contains (token))
 689:             nDeferred.addElement (token);
 690:         }
 691:     }
 692:     if (checkOne) {
 693:         for (Enumeration e = info.attributes.keys ();
 694:             e.hasMoreElements ();
 695:             /* NOP */) {
 696:         String        name;
 697:         AttributeInfo    ainfo2;
 698: 
 699:         name = (String) e.nextElement ();
 700:         ainfo2 = (AttributeInfo) info.attributes.get (name);
 701:         if (type == ainfo2.type || !interned /* NOTATION */) {
 702:             // VC: One ID per Element Type
 703:             // VC: One Notation per Element TYpe
 704:             error ("Element '" + eName
 705:             + "' already has an attribute of type "
 706:             + (interned ? "NOTATION" : type)
 707:             + " ('" + name
 708:             + "') so '" + aName 
 709:             + "' is a validity error");
 710:         }
 711:         }
 712:     }
 713: 
 714:     // VC: Attribute Default Legal
 715:     if (value != null) {
 716: 
 717:         if ("CDATA" == type) {
 718:         // event source rejected '<'
 719: 
 720:         } else if ("NMTOKEN" == type) {
 721:         // VC: Name Token (is a nmtoken)
 722:         isNmtoken (value, "attribute default", aName);
 723: 
 724:         } else if ("NMTOKENS" == type) {
 725:         // VC: Name Token (is a nmtoken; at least one value)
 726:         StringTokenizer    tokens = new StringTokenizer (value);
 727:         if (!tokens.hasMoreTokens ())
 728:             error ("Default for attribute '" + aName
 729:             + "' must have at least one name token.");
 730:         else do {
 731:             String token = tokens.nextToken ();
 732:             isNmtoken (token, "attribute default", aName);
 733:         } while (tokens.hasMoreTokens ());
 734: 
 735:         } else if ("IDREF" == type || "ENTITY" == type) {
 736:         // VC: Entity Name (is a name)
 737:         // VC: IDREF (is a name) (is declared)
 738:         isName (value, "attribute default", aName);
 739:         if ("ENTITY" == type && !unparsed.contains (value))
 740:             uDeferred.addElement (value);
 741: 
 742:         } else if ("IDREFS" == type || "ENTITIES" == type) {
 743:         // VC: Entity Name (is a name; at least one value)
 744:         // VC: IDREF (is a name; at least one value)
 745:         StringTokenizer    names = new StringTokenizer (value);
 746:         if (!names.hasMoreTokens ())
 747:             error ("Default for attribute '" + aName
 748:             + "' must have at least one name.");
 749:         else do {
 750:             String name = names.nextToken ();
 751:             isName (name, "attribute default", aName);
 752:             if ("ENTITIES" == type && !unparsed.contains (name))
 753:             uDeferred.addElement (value);
 754:         } while (names.hasMoreTokens ());
 755:         
 756:         } else if (type.charAt (0) == '(' /*)*/ ) {
 757:         // VC: Enumeration (must match)
 758:         checkEnumeration (value, type, aName);
 759: 
 760:         } else if (!interned && checkOne) {    /* NOTATION */
 761:         // VC: Notation attributes (must be names)
 762:         isName (value, "attribute default", aName);
 763: 
 764:         // VC: Notation attributes (must be declared)
 765:         if (!notations.contains (value))
 766:             nDeferred.addElement (value);
 767:         
 768:         // VC: Enumeration (must match)
 769:         checkEnumeration (value, type, aName);
 770: 
 771:         } else if ("ID" != type)
 772:         throw new RuntimeException ("illegal attribute type: " + type);
 773:     }
 774: 
 775:     if (info.attributes.get (aName) == null)
 776:         info.attributes.put (aName, ainfo);
 777:     /*
 778:     else
 779:         warning ("Element '" + eName
 780:         + "' already has an attribute named '" + aName + "'");
 781:     */
 782: 
 783:     if ("xml:space".equals (aName)) {
 784:         if (!("(default|preserve)".equals (type)
 785:             || "(preserve|default)".equals (type)
 786:             // these next two are arguable; XHTML's DTD doesn't
 787:             // deserve errors.  After all, it's not like any
 788:             // illegal _value_ could pass ...
 789:             || "(preserve)".equals (type)
 790:             || "(default)".equals (type)
 791:             ))
 792:         error (
 793:             "xml:space attribute type must be like '(default|preserve)'"
 794:             + " not '" + type + "'"
 795:             );
 796: 
 797:     }
 798:     super.attributeDecl (eName, aName, type, mode, value);
 799:     }
 800: 
 801:     /**
 802:      * <b>DecllHandler</b> Records the element declaration for later use
 803:      * when checking document content, and checks validity constraints that
 804:      * apply to element declarations.  Passed to the next consumer, unless
 805:      * this one was preloaded with a particular DTD.
 806:      */
 807:     public void elementDecl (String name, String model)
 808:     throws SAXException
 809:     {
 810:     if (disableDeclarations)
 811:         return;
 812: 
 813:     ElementInfo    info = (ElementInfo) elements.get (name);
 814: 
 815:     // we might have seen an attribute decl already
 816:     if (info == null) {
 817:         info = new ElementInfo (name);
 818:         elements.put (name, info);
 819:     }
 820:     if (info.model != null) {
 821:         // NOTE:  not all parsers can report such duplicates.
 822:         // VC: Unique Element Type Declaration
 823:         error ("Element type '" + name
 824:         + "' was already declared.");
 825:     } else {
 826:         info.model = model;
 827: 
 828:         // VC: No Duplicate Types (in mixed content models)
 829:         if (model.charAt (1) == '#')     // (#PCDATA...
 830:         info.getRecognizer (this);
 831:     }
 832:     super.elementDecl (name, model);
 833:     }
 834: 
 835:     /**
 836:      * <b>DecllHandler</b> passed to the next consumer, unless this
 837:      * one was preloaded with a particular DTD
 838:      */
 839:     public void internalEntityDecl (String name, String value)
 840:     throws SAXException
 841:     {
 842:     if (!disableDeclarations)
 843:         super.internalEntityDecl (name, value);
 844:     }
 845: 
 846:     /**
 847:      * <b>DecllHandler</b> passed to the next consumer, unless this
 848:      * one was preloaded with a particular DTD
 849:      */
 850:     public void externalEntityDecl (String name,
 851:         String publicId, String systemId)
 852:     throws SAXException
 853:     {
 854:     if (!disableDeclarations)
 855:         super.externalEntityDecl (name, publicId, systemId);
 856:     }
 857: 
 858: 
 859:     /**
 860:      * <b>DTDHandler</b> Records the notation name, for checking
 861:      * NOTATIONS attribute values and declararations of unparsed
 862:      * entities.  Passed to the next consumer, unless this one was
 863:      * preloaded with a particular DTD.
 864:      */
 865:     public void notationDecl (String name, String publicId, String systemId)
 866:     throws SAXException
 867:     {
 868:     if (disableDeclarations)
 869:         return;
 870: 
 871:     notations.addElement (name);
 872:     super.notationDecl (name, publicId, systemId);
 873:     }
 874: 
 875:     /**
 876:      * <b>DTDHandler</b> Records the entity name, for checking
 877:      * ENTITY and ENTITIES attribute values; records the notation
 878:      * name if it hasn't yet been declared.  Passed to the next consumer,
 879:      * unless this one was preloaded with a particular DTD.
 880:      */
 881:     public void unparsedEntityDecl (
 882:     String name,
 883:     String publicId,
 884:     String systemId,
 885:     String notationName
 886:     ) throws SAXException
 887:     {
 888:     if (disableDeclarations)
 889:         return;
 890: 
 891:     unparsed.addElement (name);
 892:     if (!notations.contains (notationName))
 893:         nDeferred.addElement (notationName);
 894:     super.unparsedEntityDecl (name, publicId, systemId, notationName);
 895:     }
 896:     
 897:     
 898:     /**
 899:      * <b>ContentHandler</b> Ensures that state from any previous parse
 900:      * has been deleted.
 901:      * Passed to the next consumer.
 902:      */
 903:     public void startDocument ()
 904:     throws SAXException
 905:     {
 906:     resetState ();
 907:     super.startDocument ();
 908:     }
 909: 
 910: 
 911:     private static boolean isAsciiLetter (char c)
 912:     {
 913:     return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
 914:     }
 915: 
 916: 
 917:     /**
 918:      * <b>ContentHandler</b> Reports a fatal exception.  Validating
 919:      * XML processors may not skip any entities.
 920:      */
 921:     public void skippedEntity (String name)
 922:     throws SAXException
 923:     {
 924:     fatalError ("may not skip entities");
 925:     }
 926: 
 927:     /*
 928:      * SAX2 doesn't expand non-PE refs in attribute defaults...
 929:      */
 930:     private String expandDefaultRefs (String s)
 931:     throws SAXException
 932:     {
 933:     if (s.indexOf ('&') < 0)
 934:         return s;
 935:     
 936: // FIXME: handle &#nn; &#xnn; &name;
 937:     String message = "Can't expand refs in attribute default: " + s;
 938:     warning (message);
 939: 
 940:     return s;
 941:     }
 942: 
 943:     /**
 944:      * <b>ContentHandler</b> Performs validity checks against element
 945:      * (and document) content models, and attribute values.
 946:      * Passed to the next consumer.
 947:      */
 948:     public void startElement (
 949:     String        uri,
 950:     String        localName,
 951:     String        qName,
 952:     Attributes    atts
 953:     ) throws SAXException
 954:     {
 955:     //
 956:     // First check content model for the enclosing scope.
 957:     //
 958:     if (contentStack.isEmpty ()) {
 959:         // VC:  Root Element Type
 960:         if (!qName.equals (rootName)) {
 961:         if (rootName == null)
 962:             warning ("This document has no DTD, can't be valid");
 963:         else
 964:             error ("Root element type '" + qName
 965:             + "' was declared to be '" + rootName + "'");
 966:         }
 967:     } else {
 968:         Recognizer state = (Recognizer) contentStack.peek ();
 969: 
 970:         if (state != null) {
 971:         Recognizer newstate = state.acceptElement (qName);
 972: 
 973:         if (newstate == null)
 974:             error ("Element type '" + qName
 975:             + "' in element '" + state.type.name
 976:             + "' violates content model " + state.type.model
 977:             );
 978:         if (newstate != state) {
 979:             contentStack.pop ();
 980:             contentStack.push (newstate);
 981:         }
 982:         }
 983:     }
 984: 
 985:     //
 986:     // Then check that this element was declared, and push the
 987:     // object used to validate its content model onto our stack.
 988:     //
 989:     // This is where the recognizer gets created, if needed; if
 990:     // it's a "children" (elements) content model, an NDFA is
 991:     // created.  (One recognizer is used per content type, no
 992:     // matter how complex that recognizer is.)
 993:     //
 994:     ElementInfo        info;
 995: 
 996:     info = (ElementInfo) elements.get (qName);
 997:     if (info == null || info.model == null) {
 998:         // VC: Element Valid (base clause)
 999:         error ("Element type '" + qName + "' was not declared");
1000:         contentStack.push (null);
1001: 
1002:         // for less diagnostic noise, fake a declaration.
1003:         elementDecl (qName, "ANY");
1004:     } else
1005:         contentStack.push (info.getRecognizer (this));
1006: 
1007:     //
1008:     // Then check each attribute present
1009:     //
1010:     int            len;
1011:     String            aname;
1012:     AttributeInfo        ainfo;
1013: 
1014:     if (atts != null)
1015:         len = atts.getLength ();
1016:     else
1017:         len = 0;
1018:     
1019:     for (int i = 0; i < len; i++) {
1020:         aname = atts.getQName (i);
1021: 
1022:         if (info == null
1023:             || (ainfo = (AttributeInfo) info.attributes.get (aname))
1024:                 == null) {
1025:         // VC: Attribute Value Type
1026:         error ("Attribute '" + aname
1027:             + "' was not declared for element type " + qName);
1028:         continue;
1029:         }
1030: 
1031:         String value = atts.getValue (i);
1032: 
1033:         // note that "==" for type names and "#FIXED" is correct
1034:         // (and fast) since we've interned those literals.
1035: 
1036:         if ("#FIXED" == ainfo.mode) {
1037:         String expanded = expandDefaultRefs (ainfo.value);
1038: 
1039:         // VC: Fixed Attribute Default
1040:         if (!value.equals (expanded)) {
1041:             error ("Attribute '" + aname
1042:             + "' must match " + expanded
1043:             );
1044:             continue;
1045:         }
1046:         }
1047: 
1048:         if ("CDATA" == ainfo.type)
1049:         continue;
1050:         
1051:         //
1052:         // For all other attribute types, there are various
1053:         // rules to follow.
1054:         //
1055:         
1056:         if ("ID" == ainfo.type) {
1057:         // VC: ID (must be a name)
1058:         if (isName (value, "ID attribute", aname)) {
1059:             if (Boolean.TRUE == ids.get (value))
1060:             // VC: ID (appears once)
1061:             error ("ID attribute " + aname
1062:                 + " uses an ID value '" + value
1063:                 + "' which was already declared.");
1064:             else
1065:             // any forward refs are no longer problems
1066:             ids.put (value, Boolean.TRUE);
1067:         }
1068:         continue;
1069:         } 
1070: 
1071:         if ("IDREF" == ainfo.type) {
1072:         // VC: IDREF (value must be a name)
1073:         if (isName (value, "IDREF attribute", aname)) {
1074:             // VC: IDREF (must match some ID attribute)
1075:             if (ids.get (value) == null)
1076:             // new -- assume it's a forward ref
1077:             ids.put (value, Boolean.FALSE);
1078:         }
1079:         continue;
1080:         } 
1081: 
1082:         if ("IDREFS" == ainfo.type) {
1083:         StringTokenizer    tokens = new StringTokenizer (value, " ");
1084: 
1085:         if (!tokens.hasMoreTokens ()) {
1086:             // VC: IDREF (one or more values)
1087:             error ("IDREFS attribute " + aname
1088:             + " must have at least one ID ref");
1089:         } else do {
1090:             String id = tokens.nextToken ();
1091: 
1092:             // VC: IDREF (value must be a name)
1093:             if (isName (id, "IDREFS attribute", aname)) {
1094:             // VC: IDREF (must match some ID attribute)
1095:             if (ids.get (id) == null)
1096:                 // new -- assume it's a forward ref
1097:                 ids.put (id, Boolean.FALSE);
1098:             }
1099:         } while (tokens.hasMoreTokens ());
1100:         continue;
1101:         }
1102: 
1103:         if ("NMTOKEN" == ainfo.type) {
1104:         // VC: Name Token (is a name token)
1105:         isNmtoken (value, "NMTOKEN attribute", aname);
1106:         continue;
1107:         }
1108: 
1109:         if ("NMTOKENS" == ainfo.type) {
1110:         StringTokenizer    tokens = new StringTokenizer (value, " ");
1111: 
1112:         if (!tokens.hasMoreTokens ()) {
1113:             // VC: Name Token (one or more values)
1114:             error ("NMTOKENS attribute " + aname
1115:             + " must have at least one name token");
1116:         } else do {
1117:             String token = tokens.nextToken ();
1118: 
1119:             // VC: Name Token (is a name token)
1120:             isNmtoken (token, "NMTOKENS attribute", aname);
1121:         } while (tokens.hasMoreTokens ());
1122:         continue;
1123:         }
1124: 
1125:         if ("ENTITY" == ainfo.type) {
1126:         if (!unparsed.contains (value))
1127:             // VC: Entity Name
1128:             error ("Value of attribute '" + aname
1129:             + "' refers to unparsed entity '" + value
1130:             + "' which was not declared.");
1131:         continue;
1132:         }
1133: 
1134:         if ("ENTITIES" == ainfo.type) {
1135:         StringTokenizer    tokens = new StringTokenizer (value, " ");
1136: 
1137:         if (!tokens.hasMoreTokens ()) {
1138:             // VC: Entity Name (one or more values)
1139:             error ("ENTITIES attribute " + aname
1140:             + " must have at least one name token");
1141:         } else do {
1142:             String entity = tokens.nextToken ();
1143: 
1144:             if (!unparsed.contains (entity))
1145:             // VC: Entity Name
1146:             error ("Value of attribute '" + aname
1147:                 + "' refers to unparsed entity '" + entity
1148:                 + "' which was not declared.");
1149:         } while (tokens.hasMoreTokens ());
1150:         continue;
1151:         }
1152: 
1153:         //
1154:         // check for enumerations last; more expensive
1155:         //
1156:         if (ainfo.type.charAt (0) == '(' /*)*/    
1157:             || ainfo.type.startsWith ("NOTATION ")
1158:             ) {
1159:         // VC: Enumeration (value must be defined)
1160:         checkEnumeration (value, ainfo.type, aname);
1161:         continue;
1162:         }
1163:     }
1164: 
1165:     //
1166:     // Last, check that all #REQUIRED attributes were provided
1167:     //
1168:     if (info != null) {
1169:         Hashtable    table = info.attributes;
1170: 
1171:         if (table.size () != 0) {
1172:         Enumeration    e = table.keys ();
1173: 
1174:         // XXX table.keys uses the heap, bleech -- slows things
1175: 
1176:         while (e.hasMoreElements ()) {
1177:             aname = (String) e.nextElement ();
1178:             ainfo = (AttributeInfo) table.get (aname);
1179: 
1180:             // "#REQUIRED" mode was interned in attributeDecl
1181:             if ("#REQUIRED" == ainfo.mode
1182:                 && atts.getValue (aname) == null) {
1183:             // VC: Required Attribute
1184:             error ("Attribute '" + aname + "' must be specified "
1185:                 + "for element type " + qName);
1186:             }
1187:         }
1188:         }
1189:     }
1190:     super.startElement (uri, localName, qName, atts);
1191:     }
1192: 
1193:     /**
1194:      * <b>ContentHandler</b> Reports a validity error if the element's content
1195:      * model does not permit character data.
1196:      * Passed to the next consumer.
1197:      */
1198:     public void characters (char ch [], int start, int length)
1199:     throws SAXException
1200:     {
1201:     Recognizer state;
1202: 
1203:     if (contentStack.empty ())
1204:         state = null;
1205:     else
1206:         state = (Recognizer) contentStack.peek ();
1207: 
1208:     // NOTE:  if this ever supports with SAX parsers that don't
1209:     // report ignorable whitespace as such (only XP?), this class
1210:     // needs to morph it into ignorableWhitespace() as needed ...
1211: 
1212:     if (state != null && !state.acceptCharacters ())
1213:         // VC: Element Valid (clauses three, four -- see recognizer)
1214:         error ("Character content not allowed in element "
1215:         + state.type.name);
1216:     
1217:     super.characters (ch, start, length);
1218:     }
1219:     
1220: 
1221:     /**
1222:      * <b>ContentHandler</b> Reports a validity error if the element's content
1223:      * model does not permit end-of-element yet, or a well formedness error
1224:      * if there was no matching startElement call.
1225:      * Passed to the next consumer.
1226:      */
1227:     public void endElement (String uri, String localName, String qName)
1228:     throws SAXException
1229:     {
1230:     try {
1231:         Recognizer state = (Recognizer) contentStack.pop ();
1232: 
1233:         if (state != null && !state.completed ())
1234:         // VC: Element valid (clauses two, three, four; see Recognizer)
1235:         error ("Premature end for element '"
1236:             + state.type.name
1237:             + "', content model "
1238:             + state.type.model);
1239:         
1240:         // could insist on match of start element, but that's
1241:         // something the input stream must to guarantee.
1242: 
1243:     } catch (EmptyStackException e) {
1244:         fatalError ("endElement without startElement: " + qName
1245:         + ((uri == null)
1246:             ? ""
1247:             : ( " { '" + uri + "', " + localName + " }")));
1248:     }
1249:     super.endElement (uri, localName, qName);
1250:     }
1251: 
1252:     /**
1253:      * <b>ContentHandler</b> Checks whether all ID values that were
1254:      * referenced have been declared, and releases all resources. 
1255:      * Passed to the next consumer.
1256:      * 
1257:      * @see #setDocumentLocator
1258:      */
1259:     public void endDocument ()
1260:     throws SAXException
1261:     {
1262:     for (Enumeration idNames = ids.keys ();
1263:         idNames.hasMoreElements ();
1264:         /* NOP */) {
1265:         String id = (String) idNames.nextElement ();
1266:         
1267:         if (Boolean.FALSE == ids.get (id)) {
1268:         // VC: IDREF (must match ID)
1269:         error ("Undeclared ID value '" + id
1270:             + "' was referred to by an IDREF/IDREFS attribute");
1271:         }
1272:     }
1273: 
1274:     resetState ();
1275:     super.endDocument ();
1276:     }
1277: 
1278: 
1279:     /** Holds per-element declarations */
1280:     static private final class ElementInfo
1281:     {
1282:     String            name;
1283:     String            model;
1284: 
1285:     // key = attribute name; value = AttributeInfo
1286:     Hashtable        attributes = new Hashtable (11);
1287: 
1288:     ElementInfo (String n) { name = n; }
1289: 
1290:     private Recognizer    recognizer;
1291: 
1292:     // for validating content models:  one per type, shared,
1293:     // and constructed only on demand ... so unused elements do
1294:     // not need to consume resources.
1295:     Recognizer    getRecognizer (ValidationConsumer consumer)
1296:     throws SAXException
1297:     {
1298:         if (recognizer == null) {
1299:         if ("ANY".equals (model))
1300:             recognizer = ANY;
1301:         else if ("EMPTY".equals (model))
1302:             recognizer = new EmptyRecognizer (this);
1303:         else if ('#' == model.charAt (1))
1304:             // n.b. this constructor does a validity check
1305:             recognizer = new MixedRecognizer (this, consumer);
1306:         else
1307:             recognizer = new ChildrenRecognizer (this, consumer);
1308:         }
1309:         return recognizer;
1310:     }
1311:     }
1312: 
1313:     /** Holds per-attribute declarations */
1314:     static private final class AttributeInfo
1315:     {
1316:     String    type;
1317:     String    mode;        // #REQUIRED, etc (or null)
1318:     String    value;        // or null
1319:     }
1320: 
1321: 
1322:     //
1323:     // Content model validation
1324:     //
1325: 
1326:     static private final Recognizer    ANY = new Recognizer (null);
1327: 
1328: 
1329:     // Base class defines the calls used to validate content,
1330:     // and supports the "ANY" content model
1331:     static private class Recognizer
1332:     {
1333:     final ElementInfo    type;
1334: 
1335:     Recognizer (ElementInfo t) { type = t; }
1336: 
1337:     // return true iff character data is legal here
1338:     boolean acceptCharacters ()
1339:     throws SAXException
1340:         // VC: Element Valid (third and fourth clauses)
1341:         { return true; }
1342: 
1343:     // null return = failure
1344:     // otherwise, next state (like an FSM)
1345:     // prerequisite: tested that name was declared
1346:     Recognizer acceptElement (String name)
1347:     throws SAXException
1348:         // VC: Element Valid (fourth clause)
1349:         { return this; }
1350: 
1351:     // return true iff model is completed, can finish
1352:     boolean completed ()
1353:     throws SAXException
1354:         // VC: Element Valid (fourth clause)
1355:         { return true; }
1356:     
1357:     public String toString ()
1358:         // n.b. "children" is the interesting case!
1359:         { return (type == null) ? "ANY" : type.model; }
1360:     }
1361: 
1362:     // "EMPTY" content model -- no characters or elements
1363:     private static final class EmptyRecognizer extends Recognizer
1364:     {
1365:     public EmptyRecognizer (ElementInfo type)
1366:         { super (type); }
1367: 
1368:     // VC: Element Valid (first clause)
1369:     boolean acceptCharacters ()
1370:         { return false; }
1371: 
1372:     // VC: Element Valid (first clause)
1373:     Recognizer acceptElement (String name)
1374:         { return null; }
1375:     }
1376: 
1377:     // "Mixed" content model -- ANY, but restricts elements
1378:     private static final class MixedRecognizer extends Recognizer
1379:     {
1380:     private String    permitted [];
1381: 
1382:     // N.B. constructor tests for duplicated element names (VC)
1383:     public MixedRecognizer (ElementInfo t, ValidationConsumer v)
1384:     throws SAXException
1385:     {
1386:         super (t);
1387: 
1388:         // (#PCDATA...)* or (#PCDATA) ==> ... or empty
1389:         // with the "..." being "|elname|..."
1390:         StringTokenizer    tokens = new StringTokenizer (
1391:         t.model.substring (8, t.model.lastIndexOf (')')),
1392:         "|");
1393:         Vector        vec = new Vector ();
1394: 
1395:         while (tokens.hasMoreTokens ()) {
1396:         String token = tokens.nextToken ();
1397: 
1398:         if (vec.contains (token))
1399:             v.error ("element " + token
1400:             + " is repeated in mixed content model: "
1401:             + t.model);
1402:         else
1403:             vec.addElement (token.intern ());
1404:         }
1405:         permitted = new String [vec.size ()];
1406:         for (int i = 0; i < permitted.length; i++)
1407:         permitted [i] = (String) vec.elementAt (i);
1408:         
1409:         // in one large machine-derived DTD sample, most of about
1410:         // 250 mixed content models were empty, and 25 had ten or
1411:         // more entries.  2 had over a hundred elements.  Linear
1412:         // search isn't obviously wrong.
1413:     }
1414: 
1415:     // VC: Element Valid (third clause)
1416:     Recognizer acceptElement (String name)
1417:     {
1418:         int        length = permitted.length;
1419: 
1420:         // first pass -- optimistic w.r.t. event source interning
1421:         // (and document validity)
1422:         for (int i = 0; i < length; i++)
1423:         if (permitted [i] == name)
1424:             return this;
1425:         // second pass -- pessimistic w.r.t. event source interning
1426:         for (int i = 0; i < length; i++)
1427:         if (permitted [i].equals (name))
1428:             return this;
1429:         return null;
1430:     }
1431:     }
1432: 
1433: 
1434:     // recognizer loop flags, see later
1435:     private static final int        F_LOOPHEAD = 0x01;
1436:     private static final int        F_LOOPNEXT = 0x02;
1437: 
1438:     // for debugging -- used to label/count nodes in toString()
1439:     private static int            nodeCount;
1440: 
1441:     /**
1442:      * "Children" content model -- these are nodes in NDFA state graphs.
1443:      * They work in fixed space.  Note that these graphs commonly have
1444:      * cycles, handling features such as zero-or-more and one-or-more.
1445:      *
1446:      * <p>It's readonly, so only one copy is ever needed.  The content model
1447:      * stack may have any number of pointers into each graph, when a model
1448:      * happens to be needed more than once due to element nesting.  Since
1449:      * traversing the graph just moves to another node, and never changes
1450:      * it, traversals never interfere with each other.
1451:      *
1452:      * <p>There is an option to report non-deterministic models.  These are
1453:      * always XML errors, but ones which are not often reported despite the
1454:      * fact that they can lead to different validating parsers giving
1455:      * different results for the same input.  (The XML spec doesn't require
1456:      * them to be reported.)
1457:      *
1458:      * <p><b>FIXME</b> There's currently at least one known bug here, in that
1459:      * it's not actually detecting the non-determinism it tries to detect.
1460:      * (Of the "optional.xml" test, the once-or-twice-2* tests are all non-D;
1461:      * maybe some others.)  This may relate to the issue flagged below as
1462:      * "should not" happen (but it was), which showed up when patching the
1463:      * graph to have one exit node (or more EMPTY nodes).
1464:      */
1465:     private static final class ChildrenRecognizer extends Recognizer
1466:     implements Cloneable
1467:     {
1468:     // for reporting non-deterministic content models
1469:     // ... a waste of space if we're not reporting those!
1470:     // ... along with the 'model' member (in base class)
1471:     private ValidationConsumer    consumer;
1472: 
1473:     // for CHOICE nodes -- each component is an arc that
1474:     // accepts a different NAME (or is EMPTY indicating
1475:     // NDFA termination).
1476:     private Recognizer        components [];
1477: 
1478:     // for NAME/SEQUENCE nodes -- accepts that NAME and
1479:     // then goes to the next node (CHOICE, NAME, EMPTY).
1480:     private String            name;
1481:     private Recognizer        next;
1482: 
1483:     // loops always point back to a CHOICE node. we mark such choice
1484:     // nodes (F_LOOPHEAD) for diagnostics and faster deep cloning.
1485:     // We also mark nodes before back pointers (F_LOOPNEXT), to ensure
1486:     // termination when we patch sequences and loops.
1487:     private int            flags;
1488: 
1489: 
1490:     // prevent a needless indirection between 'this' and 'node'
1491:     private void copyIn (ChildrenRecognizer node)
1492:     {
1493:         // model & consumer are already set
1494:         components = node.components;
1495:         name = node.name;
1496:         next = node.next;
1497:         flags = node.flags;
1498:     }
1499: 
1500:     // used to construct top level "children" content models,
1501:     public ChildrenRecognizer (ElementInfo type, ValidationConsumer vc)
1502:     {
1503:         this (vc, type);
1504:         populate (type.model.toCharArray (), 0);
1505:         patchNext (new EmptyRecognizer (type), null);
1506:     }
1507: 
1508:     // used internally; populating is separate
1509:     private ChildrenRecognizer (ValidationConsumer vc, ElementInfo type)
1510:     {
1511:         super (type);
1512:         consumer = vc;
1513:     }
1514: 
1515: 
1516:     //
1517:     // When rewriting some graph nodes we need deep clones in one case;
1518:     // mostly shallow clones (what the JVM handles for us) are fine.
1519:     //
1520:     private ChildrenRecognizer shallowClone ()
1521:     {
1522:         try {
1523:         return (ChildrenRecognizer) clone ();
1524:         } catch (CloneNotSupportedException e) {
1525:         throw new Error ("clone");
1526:         }
1527:     }
1528: 
1529:     private ChildrenRecognizer deepClone ()
1530:     {
1531:         return deepClone (new Hashtable (37));
1532:     }
1533: 
1534:     private ChildrenRecognizer deepClone (Hashtable table)
1535:     {
1536:         ChildrenRecognizer retval;
1537: 
1538:         if ((flags & F_LOOPHEAD) != 0) {
1539:         retval = (ChildrenRecognizer) table.get (this);
1540:         if (retval != null)
1541:             return this;
1542: 
1543:         retval = shallowClone ();
1544:         table.put (this, retval);
1545:         } else
1546:         retval = shallowClone ();
1547: 
1548:         if (next != null) {
1549:         if (next instanceof ChildrenRecognizer)
1550:             retval.next = ((ChildrenRecognizer)next)
1551:                 .deepClone (table);
1552:         else if (!(next instanceof EmptyRecognizer))
1553:             throw new RuntimeException ("deepClone");
1554:         }
1555: 
1556:         if (components != null) {
1557:         retval.components = new Recognizer [components.length];
1558:         for (int i = 0; i < components.length; i++) {
1559:             Recognizer temp = components [i];
1560: 
1561:             if (temp == null)
1562:             retval.components [i] = null;
1563:             else if (temp instanceof ChildrenRecognizer)
1564:             retval.components [i] = ((ChildrenRecognizer)temp)
1565:                 .deepClone (table);
1566:             else if (!(temp instanceof EmptyRecognizer))
1567:             throw new RuntimeException ("deepClone");
1568:         }
1569:         }
1570: 
1571:         return retval;
1572:     }
1573: 
1574:     // connect subgraphs, first to next (sequencing)
1575:     private void patchNext (Recognizer theNext, Hashtable table)
1576:     {
1577:         // backpointers must not be repatched or followed
1578:         if ((flags & F_LOOPNEXT) != 0)
1579:         return;
1580: 
1581:         // XXX this table "shouldn't" be needed, right?
1582:         // but some choice nodes looped if it isn't there.
1583:         if (table != null && table.get (this) != null)
1584:         return;
1585:         if (table == null)
1586:         table = new Hashtable ();
1587: 
1588:         // NAME/SEQUENCE
1589:         if (name != null) {
1590:         if (next == null)
1591:             next = theNext;
1592:         else if (next instanceof ChildrenRecognizer) {
1593:             ((ChildrenRecognizer)next).patchNext (theNext, table);
1594:         } else if (!(next instanceof EmptyRecognizer))
1595:             throw new RuntimeException ("patchNext");
1596:         return;
1597:         }
1598: 
1599:         // CHOICE
1600:         for (int i = 0; i < components.length; i++) {
1601:         if (components [i] == null)
1602:             components [i] = theNext;
1603:         else if (components [i] instanceof ChildrenRecognizer) {
1604:             ((ChildrenRecognizer)components [i])
1605:                 .patchNext (theNext, table);
1606:         } else if (!(components [i] instanceof EmptyRecognizer))
1607:             throw new RuntimeException ("patchNext");
1608:         }
1609: 
1610:         if (table != null && (flags & F_LOOPHEAD) != 0)
1611:         table.put (this, this);
1612:     }
1613: 
1614:     /**
1615:      * Parses a 'children' spec (or recursively 'cp') and makes this
1616:      * become a regular graph node.
1617:      *
1618:      * @return index after this particle
1619:      */
1620:     private int populate (char parseBuf [], int startPos)
1621:     {
1622:         int        nextPos = startPos + 1;
1623:         char    c;
1624: 
1625:         if (nextPos < 0 || nextPos >= parseBuf.length)
1626:         throw new IndexOutOfBoundsException ();
1627: 
1628:         // Grammar of the string is from the XML spec, but
1629:         // with whitespace removed by the SAX parser.
1630: 
1631:         // children ::= (choice | seq) ('?' | '*' | '+')?
1632:         // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1633:         // choice ::= '(' cp ('|' choice)* ')'
1634:         // seq ::= '(' cp (',' choice)* ')'
1635: 
1636:         // interior nodes only
1637:         //   cp ::= name ...
1638:         if (parseBuf [startPos] != '('/*)*/) {
1639:         boolean        done = false;
1640:         do {
1641:             switch (c = parseBuf [nextPos]) {
1642:             case '?': case '*': case '+':
1643:             case '|': case ',':
1644:             case /*(*/ ')':
1645:                 done = true;
1646:                 continue;
1647:             default:
1648:                 nextPos++;
1649:                 continue;
1650:             }
1651:         } while (!done);
1652:         name = new String (parseBuf, startPos, nextPos - startPos);
1653: 
1654:         // interior OR toplevel nodes
1655:         //   cp ::= choice ..
1656:         //   cp ::= seq ..
1657:         } else {
1658:         // collect everything as a separate list, and merge it
1659:         // into "this" later if we can (SEQUENCE or singleton)
1660:         ChildrenRecognizer    first;
1661:            
1662:         first = new ChildrenRecognizer (consumer, type);
1663:         nextPos = first.populate (parseBuf, nextPos);
1664:         c = parseBuf [nextPos++];
1665: 
1666:         if (c == ',' || c == '|') {
1667:             ChildrenRecognizer    current = first;
1668:             char        separator = c;
1669:             Vector        v = null;
1670: 
1671:             if (separator == '|') {
1672:             v = new Vector ();
1673:             v.addElement (first);
1674:             }
1675: 
1676:             do {
1677:             ChildrenRecognizer link;
1678: 
1679:             link = new ChildrenRecognizer (consumer, type);
1680:             nextPos = link.populate (parseBuf, nextPos);
1681: 
1682:             if (separator == ',') {
1683:                 current.patchNext (link, null);
1684:                 current = link;
1685:             } else
1686:                 v.addElement (link);
1687: 
1688:             c = parseBuf [nextPos++];
1689:             } while (c == separator);
1690: 
1691:             // choice ... collect everything into one array.
1692:             if (separator == '|') {
1693:             // assert v.size() > 1
1694:             components = new Recognizer [v.size ()];
1695:             for (int i = 0; i < components.length; i++) {
1696:                 components [i] = (Recognizer)
1697:                     v.elementAt (i);
1698:             }
1699:             // assert flags == 0
1700: 
1701:             // sequence ... merge into "this" to be smaller.
1702:             } else
1703:             copyIn (first);
1704: 
1705:         // treat singletons like one-node sequences.
1706:         } else
1707:             copyIn (first);
1708: 
1709:         if (c != /*(*/ ')')
1710:             throw new RuntimeException ("corrupt content model");
1711:         }
1712: 
1713:         //
1714:         // Arity is optional, and the root of all fun.  We keep the
1715:         // FSM state graph simple by only having NAME/SEQUENCE and
1716:         // CHOICE nodes (or EMPTY to terminate a model), easily
1717:         // evaluated.  So we rewrite each node that has arity, using
1718:         // those primitives.  We create loops here, if needed.
1719:         //
1720:         if (nextPos < parseBuf.length) {
1721:         c = parseBuf [nextPos];
1722:         if (c == '?' || c == '*' || c == '+') {
1723:             nextPos++;
1724: 
1725:             // Rewrite 'zero-or-one' "?" arity to a CHOICE:
1726:             //   - SEQUENCE (clone, what's next)
1727:             //   - or, what's next
1728:             // Size cost: N --> N + 1
1729:             if (c == '?') {
1730:             Recognizer        once = shallowClone ();
1731: 
1732:             components = new Recognizer [2];
1733:             components [0] = once;
1734:             // components [1] initted to null
1735:             name = null;
1736:             next = null;
1737:             flags = 0;
1738: 
1739:                 
1740:             // Rewrite 'zero-or-more' "*" arity to a CHOICE.
1741:             //   - LOOP (clone, back to this CHOICE)
1742:             //   - or, what's next
1743:             // Size cost: N --> N + 1
1744:             } else if (c == '*') {
1745:             ChildrenRecognizer    loop = shallowClone ();
1746: 
1747:             loop.patchNext (this, null);
1748:             loop.flags |= F_LOOPNEXT;
1749:             flags = F_LOOPHEAD;
1750: 
1751:             components = new Recognizer [2];
1752:             components [0] = loop;
1753:             // components [1] initted to null
1754:             name = null;
1755:             next = null;
1756: 
1757: 
1758:             // Rewrite 'one-or-more' "+" arity to a SEQUENCE.
1759:             // Basically (a)+ --> ((a),(a)*).
1760:             //   - this
1761:             //   - CHOICE
1762:             //        * LOOP (clone, back to the CHOICE)
1763:             //        * or, whatever's next
1764:             // Size cost: N --> 2N + 1
1765:             } else if (c == '+') {
1766:             ChildrenRecognizer loop = deepClone ();
1767:             ChildrenRecognizer choice;
1768: 
1769:             choice = new ChildrenRecognizer (consumer, type);
1770:             loop.patchNext (choice, null);
1771:             loop.flags |= F_LOOPNEXT;
1772:             choice.flags = F_LOOPHEAD;
1773: 
1774:             choice.components = new Recognizer [2];
1775:             choice.components [0] = loop;
1776:             // choice.components [1] initted to null
1777:             // choice.name, choice.next initted to null
1778: 
1779:             patchNext (choice, null);
1780:             }
1781:         }
1782:         }
1783: 
1784:         return nextPos;
1785:     }
1786: 
1787:     // VC: Element Valid (second clause)
1788:     boolean acceptCharacters ()
1789:         { return false; }
1790: 
1791:     // VC: Element Valid (second clause)
1792:     Recognizer acceptElement (String type)
1793:     throws SAXException
1794:     {
1795:         // NAME/SEQUENCE
1796:         if (name != null) {
1797:         if (name.equals (type))
1798:             return next;
1799:         return null;
1800:         }
1801: 
1802:         // CHOICE ... optionally reporting nondeterminism we
1803:         // run across.  we won't check out every transition
1804:         // for nondeterminism; only the ones we follow.
1805:         Recognizer    retval = null;
1806: 
1807:         for (int i = 0; i < components.length; i++) {
1808:         Recognizer temp = components [i].acceptElement (type);
1809: 
1810:         if (temp == null)
1811:             continue;
1812:         else if (!warnNonDeterministic)
1813:             return temp;
1814:         else if (retval == null)
1815:             retval = temp;
1816:         else if (retval != temp)
1817:             consumer.error ("Content model " + this.type.model
1818:             + " is non-deterministic for " + type);
1819:         }
1820:         return retval;
1821:     }
1822: 
1823:     // VC: Element Valid (second clause)
1824:     boolean completed ()
1825:     throws SAXException
1826:     {
1827:         // expecting a specific element
1828:         if (name != null)
1829:         return false;
1830:         
1831:         // choice, some sequences
1832:         for (int i = 0; i < components.length; i++) {
1833:         if (components [i].completed ())
1834:             return true;
1835:         }
1836: 
1837:         return false;
1838:     }
1839: 
1840: /** /
1841:     // FOR DEBUGGING ... flattens the graph for printing.
1842: 
1843:     public String toString ()
1844:     {
1845:         StringBuffer buf = new StringBuffer ();
1846: 
1847:         // only one set of loop labels can be generated
1848:         // at a time...
1849:         synchronized (ANY) {
1850:         nodeCount = 0;
1851: 
1852:         toString (buf, new Hashtable ());
1853:         return buf.toString ();
1854:         }
1855:     }
1856: 
1857:     private void toString (StringBuffer buf, Hashtable table)
1858:     {
1859:         // When we visit a node, label and count it.
1860:         // Nodes are never visited/counted more than once.
1861:         // For small models labels waste space, but if arity
1862:         // mappings were used the savings are substantial.
1863:         // (Plus, the output can be more readily understood.)
1864:         String temp = (String) table.get (this);
1865: 
1866:         if (temp != null) {
1867:         buf.append ('{');
1868:         buf.append (temp);
1869:         buf.append ('}');
1870:         return;
1871:         } else {
1872:         StringBuffer scratch = new StringBuffer (15);
1873: 
1874:         if ((flags & F_LOOPHEAD) != 0)
1875:             scratch.append ("loop");
1876:         else
1877:             scratch.append ("node");
1878:         scratch.append ('-');
1879:         scratch.append (++nodeCount);
1880:         temp = scratch.toString ();
1881: 
1882:         table.put (this, temp);
1883:         buf.append ('[');
1884:         buf.append (temp);
1885:         buf.append (']');
1886:         buf.append (':');
1887:         }
1888: 
1889:         // NAME/SEQUENCE
1890:         if (name != null) {
1891:         // n.b. some output encodings turn some name chars into '?'
1892:         // e.g. with Japanese names and ASCII output
1893:         buf.append (name);
1894:         if (components != null)        // bug!
1895:             buf.append ('$');
1896:         if (next == null)
1897:             buf.append (",*");
1898:         else if (next instanceof EmptyRecognizer) // patch-to-next
1899:             buf.append (",{}");
1900:         else if (next instanceof ChildrenRecognizer) {
1901:             buf.append (',');
1902:             ((ChildrenRecognizer)next).toString (buf, table);
1903:         } else                // bug!
1904:             buf.append (",+");
1905:         return;
1906:         }
1907: 
1908:         // CHOICE
1909:         buf.append ("<");
1910:         for (int i = 0; i < components.length; i++) {
1911:         if (i != 0)
1912:             buf.append ("|");
1913:         if (components [i] instanceof EmptyRecognizer) {
1914:             buf.append ("{}");
1915:         } else if (components [i] == null) {    // patch-to-next
1916:             buf.append ('*');
1917:         } else {
1918:             ChildrenRecognizer r;
1919: 
1920:             r = (ChildrenRecognizer) components [i];
1921:             r.toString (buf, table);
1922:         }
1923:         }
1924:         buf.append (">");
1925:     }
1926: /**/
1927:     }
1928: }