The SAX Parser locator Facility
Part 3

Russell Bateman
last update:




This is yet another working sample of maintaining location in a SAX parsing handler, that is, of getting accurate line and column numbers out of it. Scraping this as a starting point should give you a leg up on your SAX parser, whatever its purpose.

What's missing? Well, pretty quickly—depending on what you need a SAX parser for—you'll need some kind of stack storage and management. I have some examples of that elsewhere.

Analyzer.java:
package com.windofkeltia.processor;

import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.SAXException;

import com.windofkeltia.sax.Position;

/**
 * Thin driver around a SAX parser: it couples an input stream with an
 * {@link AnalyzerHandler} and exposes the start and end positions the handler
 * recorded during the parse.
 */
public class Analyzer
{
  private static final SAXParserFactory factory = SAXParserFactory.newInstance();

  private final InputStream     inputStream;
  private final SAXParser       parser;
  private final AnalyzerHandler handler;

  /**
   * Look for an element, begin displaying output (which is just parsing status)
   * until that element's close is found, then stop displaying output again.
   *
   * @param flowfile       the XML document to parse; it is only read once {@link #parse()} is called.
   * @param contentElement name of the element whose content is to be reported.
   * @throws ParserConfigurationException if the factory cannot create a parser.
   * @throws SAXException                 if the factory fails for any other SAX reason.
   */
  public Analyzer( InputStream flowfile, final String contentElement )
      throws ParserConfigurationException, SAXException
  {
    // obtain the parser first: if this throws, no handler is ever constructed
    this.parser      = factory.newSAXParser();
    this.handler     = new AnalyzerHandler( contentElement );
    this.inputStream = flowfile;
  }

  /**
   * Runs the parse, feeding every SAX event to the handler.
   *
   * @throws IOException  if reading the input stream fails.
   * @throws SAXException if the parser reports an error.
   */
  public void parse() throws IOException, SAXException
  {
    this.parser.parse( this.inputStream, this.handler );
  }

  /** @return the start position recorded by the handler. */
  public Position getStart()
  {
    return this.handler.getStart();
  }

  /** @return the end position recorded by the handler. */
  public Position getEnd()
  {
    return this.handler.getEnd();
  }
}
AnalyzerHandler.java

Each entry point (handler method) prints out status including line- and column number and the element name (and attributes if relevant).

package com.windofkeltia.processor;

import java.util.HashMap;
import java.util.Map;

import static java.util.Objects.isNull;

import org.xml.sax.Attributes;
import org.xml.sax.Locator;
import org.xml.sax.helpers.DefaultHandler;

import com.windofkeltia.sax.Position;
import com.windofkeltia.utilities.StringUtilities;

/**
 * SAX handler that keeps track of line/column positions and, for everything nested
 * inside a nominated "content" element, prints one status line per event to standard
 * output. It also records where that nested content starts and ends.
 *
 * <p>The tracked {@code position} is refreshed after character data and after each
 * closing tag inside the content element, so when an opening tag arrives it holds
 * the location reported at the previous such event, which serves as the starting
 * point of the element being opened.</p>
 */
public class AnalyzerHandler extends DefaultHandler
{
  private       Locator  locator;                   // supplied by the parser; may never be set
  private       Position position = new Position(); // starting element position we maintain
  private       Position start    = null;           // where the first element inside the content element begins
  private       Position end      = null;           // where the most recently closed element inside it ends
  private       boolean  outputOn = false;          // allows us to start/stop some (output) action
  private final String   contentElement;

  /**
   * @param contentElement qualified name of the element whose nested content is reported.
   */
  public AnalyzerHandler( final String contentElement )
  {
    super();
    this.contentElement = contentElement;
    System.out.println( "AnalyzerHandler():" );
  }

  /**
   * Switches output on when the content element opens; for any element opened while
   * output is on, prints "line,column name" followed by the element's attributes, if any.
   * The first such element also fixes the recorded start position.
   */
  @Override
  public void startElement( String uri, String localName, String elementName, Attributes attributes )
  {
    if( elementName.equals( contentElement ) )
    {
      outputOn = true;
      return;
    }

    if( !outputOn )
      return;

    // copy rather than alias so a caller holding getStart() cannot disturb our tracking
    if( isNull( start ) )
      start = new Position( position.line, position.column );

    StringBuilder sb = new StringBuilder();
    sb.append( position.line ).append( ',' ).append( position.column ).append( ' ' );
    sb.append( elementName );

    String attributeList = javaAttributesAsString( getAttributesAsJavaMap( attributes ) );
    if( !attributeList.isEmpty() )
      sb.append( ' ' ).append( attributeList );

    System.out.println( sb );
  }

  /**
   * Switches output off when the content element closes; for any element closed while
   * output is on, prints "line,column name", advances the tracked position and records
   * that position as the (latest) end of the content.
   */
  @Override
  public void endElement( String uri, String localName, String elementName )
  {
    if( elementName.equals( contentElement ) )
      outputOn = false;

    if( !outputOn )
      return;

    StringBuilder sb = new StringBuilder();
    sb.append( position.line ).append( ',' ).append( position.column ).append( ' ' );
    sb.append( elementName );
    System.out.println( sb );

    // update the starting point for the next element
    updateElementPoint( locator );

    // Assign the FIELD (the original assigned a shadowing local, so getEnd() was always
    // null). Overwritten on every close so that, once the content element itself closes,
    // it holds the location following the last element inside it.
    end = new Position( position.line, position.column );
  }

  /**
   * Advances the tracked position past the character data and, while output is on,
   * prints the trimmed text (if any is left after trimming) in double quotes.
   */
  @Override
  public void characters( char[] ch, int start, int length )
  {
    updateElementPoint( locator );  // now update the starting point

    if( !outputOn )
      return;

    String characters = new String( ch, start, length ).trim();
    if( StringUtilities.isEmpty( characters ) )
      return;

    StringBuilder sb = new StringBuilder();
    sb.append( position.line ).append( ',' ).append( position.column ).append( ' ' );
    sb.append( '\"' ).append( characters ).append( '\"' );
    System.out.println( sb );
  }

  /**
   * Prints a comment's trimmed text while output is on.
   *
   * <p>NOTE(review): this overrides nothing in {@link DefaultHandler}; SAX delivers comments
   * through {@code org.xml.sax.ext.LexicalHandler}, which this class does not implement,
   * so nothing visible here will invoke this method -- confirm whether that is intended.</p>
   */
  public void comment( char[] ch, int start, int length )
  {
    if( !outputOn )
      return;

    String comment = new String( ch, start, length );
    System.out.println( "        comment(): \"" + comment.trim() + "\"" );
  }

  @Override
  public void startDocument() { if( outputOn ) System.out.println( "  startDocument():" ); }
  @Override
  public void endDocument()   { if( outputOn ) System.out.println( "    endDocument():" ); }

  /** Keeps the parser's locator so that the other callbacks can ask it where they are. */
  @Override
  public void setDocumentLocator( Locator location ) { locator = location; }

  /** @return where the first element inside the content element starts, or null if none was seen. */
  public Position getStart() { return start; }
  /** @return where the last closed element inside the content element ends, or null if none was seen. */
  public Position getEnd()   { return end; }

  /**
   * Moves the tracked position forward to the locator's current location; it never moves
   * backward. Does nothing when the parser supplied no locator.
   */
  private void updateElementPoint( Locator locator )
  {
    if( isNull( locator ) )
      return;

    Position location = new Position( locator.getLineNumber(), locator.getColumnNumber() );
    if( position.compareTo( location ) < 0 )
      position = location;
  }

  /**
   * Here's how to make SAX' attributes "Java-useful." If we had uri (namespaces) defined,
   * we'd have to get a lot more serious about how to use uri, localName and qName.
   *
   * @return a map of attribute qualified name to value; empty when there are no attributes.
   */
  private Map< String, String > getAttributesAsJavaMap( Attributes saxAttributes )
  {
    int                   attrLength = saxAttributes.getLength();
    Map< String, String > javaAttributes = new HashMap<>( attrLength );

    for( int attr = 0; attr < attrLength; attr++ )
    {
      String attribute = saxAttributes.getQName( attr );
      String value     = saxAttributes.getValue( attr );
      javaAttributes.put( attribute, value );
    }

    return javaAttributes;
  }

  /**
   * Renders attributes as {@code name="value", name="value"} with each value's white space
   * smashed; returns the empty string when the map is empty.
   */
  private String javaAttributesAsString( Map< String, String > javaAttributes )
  {
    if( javaAttributes.isEmpty() )
      return "";

    StringBuilder sb = new StringBuilder();
    for( Map.Entry< String, String > attribute : javaAttributes.entrySet() )
      sb.append( attribute.getKey() )
        .append( "=\"" )
        .append( StringUtilities.smash( attribute.getValue() ) )
        .append( "\", " );
    sb.setLength( sb.length()-2 );  // drop the trailing ", "
    return sb.toString();
  }
}
Position.java
package com.windofkeltia.sax;

/**
 * A mutable line/column pair naming a location in a document. A freshly
 * constructed position points at line 1, column 1.
 */
public class Position
{
  public int line;
  public int column;

  /** Creates a position at line 1, column 1. */
  public Position()
  {
    this( 1, 1 );
  }

  /** Creates a position at the given line and column. */
  public Position( int line, int column )
  {
    this.line   = line;
    this.column = column;
  }

  public void setLine( int line )     { this.line   = line; }
  public void setColumn( int column ) { this.column = column; }

  public int getLine()   { return line; }
  public int getColumn() { return column; }

  /**
   * Orders this position against another, line first and column second.
   *
   * @param position the location to compare with.
   * @return -1 if this position lies before the argument, 0 if both name the
   *         same line and column, 1 if this position lies after the argument.
   */
  public int compareTo( Position position )
  {
    int theirLine   = position.getLine();
    int theirColumn = position.getColumn();

    // different lines settle the matter outright...
    if( getLine() != theirLine )
      return ( getLine() < theirLine ) ? -1 : 1;

    // ...otherwise, on the same line, columns decide
    if( getColumn() != theirColumn )
      return ( getColumn() < theirColumn ) ? -1 : 1;

    return 0;
  }
}
StringUtilities.java
  /**
   * Tells whether a string is null or has no characters at all. A string made up
   * only of white space is not considered empty.
   */
  public static boolean isEmpty( String string )
  {
    return string == null || string.isEmpty();
  }


  /**
   * Remove all newlines, multiple spaces, tabs, etc. to neutralize this string. This is helpful
   * when comparing test results where white space doesn't count. This does not remove all
   * spaces--only multiple, adjacent ones.
   *
   * <p>Precisely: each tab is treated as a space, each newline ('\n') is dropped outright
   * (it is not replaced by a space) and every run of adjacent spaces that results is
   * reduced to a single space. Leading and trailing space is reduced, not stripped.</p>
   *
   * @param string the text to neutralize; may be null or empty.
   * @return the neutralized text, or the argument itself when it is null or empty.
   */
  public static String smash( String string )
  {
    if( string == null || string.isEmpty() )
      return string;

    int           length           = string.length();
    StringBuilder result           = new StringBuilder( length );  // avoids quadratic String +=
    boolean       previousWasSpace = false;

    for( int index = 0; index < length; index++ )
    {
      char c = string.charAt( index );

      // a newline vanishes and does not interrupt a run of spaces around it
      if( c == '\n' )
        continue;

      if( c == '\t' )
        c = ' ';

      if( c == ' ' )
      {
        if( !previousWasSpace )
          result.append( ' ' );
        previousWasSpace = true;
      }
      else
      {
        result.append( c );
        previousWasSpace = false;
      }
    }

    return result.toString();
  }
AnalyzerTest.java:

Much transformation (to simplify) went into this, so I don't guarantee it works as-is. Anyway, the output is supposed to be (simply) what's in CCD_BODY.

package com.windofkeltia.processor;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;

import org.xml.sax.SAXException;

import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;

import static org.junit.Assert.assertEquals;

/**
 * Exercises {@link Analyzer} on a small document in which a CCD body is wrapped in a
 * {@code ccdmessage} element. The test makes no assertions; it echoes the numbered
 * input and lets the handler print its parsing status for inspection.
 */
public class AnalyzerTest
{
  // the original referenced 'name' without declaring it; JUnit's TestName rule is the
  // conventional source -- NOTE(review): TestUtilities is not shown here, confirm that
  // its setUp() accepts a TestName
  @Rule   public TestName name = new TestName();
  @After  public void tearDown() { }
  @Before public void setUp() { TestUtilities.setUp( name ); }

  private static final boolean VERBOSE  = true;

  private static final String CCD_BODY = ""
      + "<ClinicalDocument xmlns=\"urn:hl7-org:v3\" xmlns:sdtc=\"urn:hl7-org:sdtc\"\n"
      + "                  xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
      + "                  xsi:schemaLocation=\"urn:hl7-org:v3\n"
      + "                                       http://xreg2.nist.gov:8080/hitspValidation/schema/cdar2c32/infrastructure/cda/C32_CDA.xsd\">\n"
      + "  <realmCode code=\"US\"/>\n"
      + "  <typeId root=\"2.16.840.1.113883.1.3\" extension=\"POCD_HD000040\"/>\n"
      + "  <component>\n"
      + "    <stuff>\n"
      + "      ...\n"
      + "    </stuff>\n"
      + "  </component>\n"
      + "</ClinicalDocument>\n";

  private static final String FROM_JDBCTOXML = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
      + "<record>\n"
      + "  <justforfun>\n"
      + "    ...\n"
      + "  </justforfun>\n"
      + "  <ccdmessage>\n"
      + CCD_BODY
      + "  </ccdmessage>\n"
      + "</record>\n";


  /**
   * Prints the input with line numbers (when VERBOSE), then parses it looking for the
   * content of {@code ccdmessage}.
   */
  @Test
  public void testAnalyzer() throws ParserConfigurationException, SAXException, IOException
  {
    if( VERBOSE )
    {
      String[] content = FROM_JDBCTOXML.split( "\n" );
      System.out.println( "Input:" );
      int lineNumber = 1;
      for( String line : content )
        System.out.println( lineNumber++ + " " + line );
      System.out.println();
    }

    // the document declares encoding="UTF-8", so encode it that way rather than
    // with whatever the platform default charset happens to be
    byte[]   bytes    = FROM_JDBCTOXML.getBytes( StandardCharsets.UTF_8 );
    Analyzer analyzer = new Analyzer( new ByteArrayInputStream( bytes ), "ccdmessage" );
    analyzer.parse();
  }
}