Notes on writing a simple NiFi custom processor

Russell Bateman
July 2020
last update:

Table of Contents

Writing a custom NiFi processor
Flowfile I/O options
session I/O callbacks
Transferring the flowfile
Flowfile attributes
Custom processor code
NiFi's JUnit test runner
Output from test
Dealing with logging messages
A second custom processor example

This is the coded content of a custom processor. It's consciously a beginner's tutorial on writing a custom processor, writing tests for it and debugging it. Please see Notes on a simple NiFi custom processor project for building it.

Writing a custom NiFi processor

The heavy lifting in a NiFi processor is done by the onTrigger() method. There are several different constructs for writing NiFi processors. We will show them below. However, first, here's how most of them start out:

package com.windofkeltia.processor;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.nifi.annotation.behavior.SideEffectFree;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;

@SideEffectFree
@Tags( { "instruction" } )
@CapabilityDescription( "Themed processor to illustrate beginning to write custom processors." )
public class Monopoly extends AbstractProcessor
{
  @Override
  public void onTrigger( final ProcessContext context, final ProcessSession session ) throws ProcessException
  {
    FlowFile flowfile = session.get();

    if( flowfile == null )
    {
      context.yield();
      return;
    }
 

The above obtains the in-coming flowfile that NiFi has handed to this session.

Flowfile I/O options

Calling session.transfer( flowfile, SUCCESS ) at this point would transfer, unchanged, whatever the flowfile is at the time of the call. Of course, the point of writing a NiFi processor is usually to change the flowfile: either create a new one, remove the old one or a combination of those options.

  1. The first option is to read the flowfile obtained early on from session.get(). As it sits, this one does not presume you wish to do anything in particular with the in-coming flowfile.
     
      session.read( flowfile, new InputStreamCallback()
      {
        @Override
        public void process( InputStream inputStream ) throws IOException
        {
          // read the in-coming flowfile and understand it...
        }
      } );
    
  2. At some point, you may wish to create a flowfile that will leave your processor with new or different content inside.
     
    FlowFile newFlowfile = session.write( flowfile, new OutputStreamCallback()
    {
      @Override public void process( OutputStream outputStream ) throws IOException
      {
        // write out a new flowfile...
      }
    } );
    
  3. Much of the time, your processor will want to create a new flowfile based on what you learn by reading the in-coming one. Anything you write to the output stream is put into a new flowfile, the one returned from the call to session.write() here. A concrete sketch follows below.
     
    FlowFile resultingFlowfile = session.write( flowfile, new StreamCallback()
    {
      @Override public void process( InputStream inputStream, OutputStream outputStream ) throws IOException
      {
        // read the in-coming flowfile, understand it, then write out a resulting new flowfile--all simultaneously...
      }
    } );
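
For example, here is what the third option can look like in practice: a StreamCallback that copies the in-coming content into the new flowfile while upper-casing it. The transformation is purely illustrative (and naive about multi-byte characters); it assumes the usual java.io and org.apache.nifi.processor.io imports.

  FlowFile resultingFlowfile = session.write( flowfile, new StreamCallback()
  {
    @Override public void process( InputStream inputStream, OutputStream outputStream ) throws IOException
    {
      byte[] buffer = new byte[ 8192 ];
      int    count;

      // copy the in-coming content a buffer at a time, upper-casing it on the way out...
      while( ( count = inputStream.read( buffer ) ) != -1 )
        outputStream.write( new String( buffer, 0, count ).toUpperCase().getBytes() );
    }
  } );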
    

Transferring the flowfile

Whatever is done to the flowfile, such as adding attributes or having created a whole new one, the end result is to transfer the flowfile to a relationship. Relationships are defined by your processor. For example, you might define one for where to transfer a flowfile upon success and another to transfer it in case of failure. In the UI, you draw to create the queues resulting from these relationships and feed the queued flowfiles into other processors.

 
  session.transfer( newFlowfile, SUCCESS );
}

If, however, you've finished with the original, in-coming flowfile and you do not wish to transfer it to a relationship, you must remove it or NiFi will fail and complain:

 
  session.transfer( newFlowfile, SUCCESS );
  session.remove( flowfile );
}

Or, you can create different relationships, like OLD and NEW, and transfer the original, in-coming flowfile one way and the resulting flowfile you just created another.

 
  session.transfer( newFlowfile, NEW );
  session.transfer( flowfile, OLD );
}
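
A minimal sketch of how such relationships might be declared (the names here are illustrative, not NiFi conventions); like SUCCESS and FAILURE in the full example below, they must also be returned from getRelationships():

  public static final Relationship NEW = new Relationship.Builder()
      .name( "new" )
      .description( "The newly created flowfile routes here." )
      .build();
  public static final Relationship OLD = new Relationship.Builder()
      .name( "old" )
      .description( "The original, in-coming flowfile routes here." )
      .build();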

Flowfile attributes

Flowfiles have attributes. By default, NiFi ensures that every flowfile has at least the following attributes: filename, path and uuid.

You can add your own flowfile attributes. Do not choose any of the above names.

 
  flowfile = session.putAttribute( flowfile, "attribute-name", "value" );
 

Much of what your session does for you works a little like the Builder pattern: successive calls are cumulative, and to make each change stick to the flowfile, you pass in the flowfile and receive back a reference to a flowfile that includes whatever you just did.

If you do this:

 
  flowfile = session.putAttribute( flowfile, "attribute-first", "value" );
  session.putAttribute( flowfile, "attribute-second", "value" );
  .
  .
  .
  session.transfer( flowfile, SUCCESS );
}

flowfile will not have attribute-second on it when you view it in the queue, because the reference returned by the second call was discarded, but it will have attribute-first.
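
To keep both attributes, capture the returned reference on every call:

  flowfile = session.putAttribute( flowfile, "attribute-first",  "value" );
  flowfile = session.putAttribute( flowfile, "attribute-second", "value" );
  .
  .
  .
  session.transfer( flowfile, SUCCESS );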

Custom processor code

There are comments on many lines that explain what's happening.

monopoly/src/main/com/windofkeltia/processor/Monopoly.java:
package com.windofkeltia.processor;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.nifi.annotation.behavior.SideEffectFree;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;

/**
 * Sample Monopoly-themed custom processor paired with simple JUnit test.
 * This processor is consciously for beginning to learn how to write and
 * test a custom processor.
 * @author Russell Bateman
 * @since July 2020
 */
@SideEffectFree
@Tags( { "instruction" } )
@CapabilityDescription( "Illustrates beginning to write custom processors." )
public class Monopoly extends AbstractProcessor
{
  @Override public void onTrigger( ProcessContext context, ProcessSession session ) throws ProcessException
  {
    FlowFile flowfile = session.get();

    if( flowfile == null )
    {
      context.yield();
      return;
    }

    /* ------------------------------------------------------------------------------------
     * Here's where the real work of a custom processor goes, usually containing a call to:
     * a) session.read( flowfile, new InputStreamCallback()...
     * b) session.write( flowfile, new OutputStreamCallback()... or
     * c) session.write( flowfile, new StreamCallback()...
     *
     * The original flowfile is read, maybe a new one is written to output.
     * ------------------------------------------------------------------------------------
     */

    // how we reach nifi-app.log...
    getLogger().info( "Just passing through on our way to Go..." );

    // how to get the value of a property...
    String favorite = context.getProperty( MONOPOLY_PROPERTY.getName() ).getValue();

    if( !favorite.equals( "Bordwalk" ) )
    {
      session.transfer( flowfile, FAILURE );
      return;
    }

    // how to create and set a new flowfile attribute...
    flowfile = session.putAttribute( flowfile, "property", "Bordwalk" );
    flowfile = session.putAttribute( flowfile, "content", "flowfile unchanged" );
    session.transfer( flowfile, SUCCESS );
  }

  public static final PropertyDescriptor MONOPOLY_PROPERTY = new PropertyDescriptor.Builder()
      .name( "Favorite property name" )           // must always remain the same or flows using this processor will break!
      .displayName( "Nom de propriété préférée" ) // this name can change without breaking flow.xml.gz
      .addValidator( StandardValidators.NON_EMPTY_VALIDATOR )
      .defaultValue( "Bordwalk" )
      .description( "Favorite property name on the Monopoly board." )
      .build();

  public static final Relationship SUCCESS = new Relationship.Builder()
      .name( "Success" )
      .description( "The flowfile passes go." )
      .build();
  public static final Relationship FAILURE = new Relationship.Builder()
      .name( "Failure" )
      .description( "The flowfile doesn't pass go." )
      .build();

  private List< PropertyDescriptor > properties;
  private Set< Relationship >        relationships;

  @Override
  public void init( final ProcessorInitializationContext context )
  {
    List< PropertyDescriptor > properties = new ArrayList<>();
    properties.add( MONOPOLY_PROPERTY );
    this.properties = Collections.unmodifiableList( properties );

    Set< Relationship > relationships = new HashSet<>();
    relationships.add( SUCCESS );
    relationships.add( FAILURE );
    this.relationships = Collections.unmodifiableSet( relationships );
  }

  @Override public List< PropertyDescriptor > getSupportedPropertyDescriptors() { return properties; }
  @Override public Set< Relationship >        getRelationships()                { return relationships; }
}
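
Incidentally, onTrigger() above fetches the property value by its name; it is equivalent to fetch it by the descriptor itself:

  String favorite = context.getProperty( MONOPOLY_PROPERTY ).getValue();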

NiFi's JUnit test runner

There are comments on many lines that explain what's happening.

monopoly/src/test/com/windofkeltia/processor/MonopolyTest.java:
package com.windofkeltia.processor;

import java.io.ByteArrayInputStream;
import java.util.List;
import java.util.Map;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;

import com.windofkeltia.utilities.StringUtilities;

/**
 * Set breakpoints in Monopoly.onTrigger() and step through these tests
 * in the debugger.
 * @author Russell Bateman
 * @since July 2020
 */
public class MonopolyTest
{
  @After public void tearDown() { }
  @Before public void setUp() { runner = TestRunners.newTestRunner( processor ); }

  private final Monopoly   processor = new Monopoly();
  private       TestRunner runner;

  private static final boolean VERBOSE = true;

  @Test
  public void testSuccess()
  {
    System.out.println( "\n--- testSuccess() -----------------------------------------------------------------------" );

    // how to set a property value...
    runner.setProperty( Monopoly.MONOPOLY_PROPERTY, "Bordwalk" );
    // how to create flowfile content...
    runner.enqueue( new ByteArrayInputStream( "Take a ride on the Reading Railroad!".getBytes() ) );
    // this runs the processor; your first breakpoint opportunity is likely onTrigger()...
    runner.run( 1 );
    runner.assertQueueEmpty();

    // get all the flowfiles that were transferred to SUCCESS...
    List< MockFlowFile > flowfiles = runner.getFlowFilesForRelationship( Monopoly.SUCCESS );
    assertEquals( 1, flowfiles.size() );

    // we know there's only one flowfile, so get it for our test...
    MockFlowFile          flowfile   = flowfiles.get( 0 );  assertNotNull( flowfile );
    String                content    = new String( runner.getContentAsByteArray( flowfile ) );
    Map< String, String > attributes = flowfile.getAttributes();

    if( VERBOSE )
    {
      displayAttributes( attributes );
      System.out.println( "Content: ---------------------------------------------------------------------------------");
      System.out.println( "  " + content );
    }
  }

  @Test
  public void testFailure()
  {
    System.out.println( "\n--- testFailure() -----------------------------------------------------------------------" );
    runner.setProperty( Monopoly.MONOPOLY_PROPERTY, "Park Place" );
    runner.enqueue( new ByteArrayInputStream( "Go to jail; go directly to jail!".getBytes() ) );
    runner.run( 1 );
    runner.assertQueueEmpty();

    // get all the flowfiles that were transferred to FAILURE...
    List< MockFlowFile > flowfiles = runner.getFlowFilesForRelationship( Monopoly.FAILURE );
    assertEquals( 1, flowfiles.size() );

    MockFlowFile          flowfile   = flowfiles.get( 0 );  assertNotNull( flowfile );
    String                content    = new String( runner.getContentAsByteArray( flowfile ) );
    Map< String, String > attributes = flowfile.getAttributes();

    if( VERBOSE )
    {
      displayAttributes( attributes );
      System.out.println( "Content: ---------------------------------------------------------------------------------");
      System.out.println( "  " + content );
    }
  }

  private void displayAttributes( Map< String, String > attributes )
  {
    System.out.println( "Attributes: --------------------------------------------------------------------------------");

    for( Map.Entry< String, String > attribute : attributes.entrySet() )
      System.out.println( "  " + StringUtilities.padStringLeft( attribute.getKey(), 9 )
          + ": " + attribute.getValue() );
  }
}
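
The test calls StringUtilities.padStringLeft(), a utility of the author's not reproduced here. A minimal stand-in, assuming it merely right-justifies a string in a field of the given width, might be:

package com.windofkeltia.utilities;

public class StringUtilities
{
  /** Hypothetical stand-in: right-justify s in a field width characters wide. */
  public static String padStringLeft( String s, int width )
  {
    return ( s.length() >= width ) ? s : String.format( "%" + width + "s", s );
  }
}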

Output from test

--- testFailure() -----------------------------------------------------------------------
415  [pool-1-thread-1] INFO  c.w.p.Monopoly.info:236 - Monopoly[id=942cadb8-e1a6-4d87-8c23-92c9c5108078] Just passing through on our way to Go...
Attributes: --------------------------------------------------------------------------------
       path: target
   filename: 2789895456519550.mockFlowFile
       uuid: 5be96b6b-77e0-4ed9-9604-55ff14d20767
Content: ---------------------------------------------------------------------------------
  Go to jail; go directly to jail!

--- testSuccess() -----------------------------------------------------------------------
429  [pool-2-thread-1] INFO  c.w.p.Monopoly.info:236 - Monopoly[id=76832e61-dd8c-4813-9f2e-c241c3a50114] Just passing through on our way to Go...
Attributes: --------------------------------------------------------------------------------
       path: target
   filename: 2789895534284706.mockFlowFile
   property: Bordwalk
       uuid: fe6d4014-916e-4110-877c-85008e0772fc
    content: flowfile unchanged
Content: ---------------------------------------------------------------------------------
  Take a ride on the Reading Railroad!

Process finished with exit code 0

Dealing with logging messages

Calls to getLogger() in the processor reach the console during JUnit tests by way of whatever logging configuration sits on the test classpath; this logback.xml controls what's printed and at what level:

monopoly/src/test/resources/logback.xml:
<configuration>
  <!-- This makes the NiFi component logger spit stuff out to the console in running tests.
       We set the level to DEBUG (or TRACE if we use that) to see the maximum.
    -->
  <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
    <encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
      <!-- <pattern>%-4r [%t] %-5p %c.%m%n:%L</pattern>-->
      <pattern>%-4relative [%thread] %-5level %logger{5}.%method:%line - %message%n</pattern>
      <!-- %relative outputs the number of milliseconds elapsed since beginning of execution
           %logger{n} where {n} is the abbreviation length (a number) or refers to a "scheme", see
           http://logback.qos.ch/manual/layouts.html
        -->
    </encoder>
  </appender>

  <logger name="org.apache.nifi" level="INFO">
    <appender-ref ref="CONSOLE" />
  </logger>

  <logger name="com.windofkeltia.processor" level="DEBUG" additivity="false">
    <appender-ref ref="CONSOLE" />
  </logger>

  <root level="INFO">
    <appender-ref ref="CONSOLE" />
  </root>
</configuration>
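
Because the com.windofkeltia.processor logger above is set to DEBUG, debug-level calls in the processor also reach the test console, for example:

  getLogger().debug( "Just visiting..." );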

A second custom processor example: two flowfiles split from one

This processor takes a flowfile full of XML, excerpts a (in practice, really huge) portion of it and puts it out as a separate flowfile, then analyzes the remainder and encodes the analysis as a second flowfile. Upon error, it sends the original flowfile out the failure relationship. This code has been somewhat reduced to leave the minimum that shows one way to get from an incoming flowfile to two new flowfiles.

There is no concern in cloning even a huge incoming flowfile because it's not the content of the flowfile that's duplicated, but just the FlowFile object. Also, the uuid of each clone is new (and distinct).

If you're keen on the class, KMPStreamMatch, or its companion that operates on a String, e-mail me and ask. If you're curious about parsing XML, check out these notes on SAX.

Here's a sample "big" XML:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<bigxml>
  <excerpt type="text" xml:space="preserve" xmlns:csmk="http://software.windofkeltia.com/sdk/csmk-v1">
    This is a test of the Emergency Broadcast System. This is only a test. If this had been a real
    emergency, you would have been told where to go. The quick brown fox jumped over the lazy dog's
    back and made a clean get-away. Just what does the fox say anyway?
  </excerpt>

  <!-- XML body that, when parsed for analysis, results in a list of useful objects. -->

</bigxml>
Constants.java:
package com.windofkeltia.processor;

public class Constants
{
  protected static final String EXCERPT_DOCUMENT  = "excerpt-document"; // attribute and relationship name used by ExtractBigXml
  protected static final String BIGXML_POJOS      = "bigxml-pojos";
}
ExtractBigXml.java:
package com.windofkeltia.processor;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import org.apache.nifi.annotation.behavior.TriggerSerially;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.annotation.lifecycle.OnStopped;
import org.apache.nifi.annotation.lifecycle.OnUnscheduled;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.processor.io.OutputStreamCallback;

import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.xml.StaxDriver;

import com.windofkeltia.pojos.Pojo;
import com.windofkeltia.utilities.KMPStreamMatch;

import static com.windofkeltia.processor.Constants.EXCERPT_DOCUMENT;

/**
 * @author Russell Bateman
 * @since August 2020
 */
@TriggerSerially
@CapabilityDescription( "Extracts data from an XML producing serialized POJOs. Copy the XML's excerpt document to excerpt as a"
                + " separate output flowfile. The two are \"bound\" together by the original 'uuid' of the incoming flowfile"
                + " retained on the serialized POJO flowfile created and the annotation of attribute '" + EXCERPT-DOCUMENT
                + "' on the excerpt flowfile." )
@WritesAttributes(
    { @WritesAttribute( attribute   = EXCERPT_DOCUMENT,
                   description = "Both flowfiles on success will have this discrete value in their '" + EXCERPT_DOCUMENT
                            + "' attribute." )
    } )
public class ExtractBigXml extends AbstractProcessor
{
  private static final String EXCERPT_START_ELEMENT = "<excerpt ";
  private static final String EXCERPT_END_ELEMENT   = "</excerpt>";

  @Override public void onTrigger( final ProcessContext context, final ProcessSession session ) throws ProcessException
  {
    FlowFile flowfile = session.get();

    if( flowfile == null )
    {
      context.yield();
      return;
    }

    FlowFile excerpt = session.clone( flowfile );
    FlowFile pojos   = session.clone( excerpt );

    try
    {
      final String UUID = flowfile.getAttribute( "uuid" );

      // write the original excerpt to a new flowfile (capturing the returned reference)...
      excerpt = session.write( excerpt, new OutputStreamCallback()
      {
        @Override public void process( OutputStream outputStream )
        {
          // read from the original flowfile copying to the output flowfile...
          session.read( flowfile, new InputStreamCallback()
          {
            @Override public void process( InputStream inputStream ) throws IOException
            {
              OutputStream   ignore = new ByteArrayOutputStream(); // (place to write the before stuff we'll promptly ignore)
              KMPStreamMatch match  = new KMPStreamMatch();
              long           result;

              /* 1. Find the beginning of the original excerpt; this will consume "<excerpt " from inputStream.
               * 2. Since we need the potential attribute list anyway, add "<excerpt " to outputStream.
               * 3. Allow the attribute list (still in inputStream) and the original excerpt to flow into outputStream.
               * 4. Since we put "<excerpt ... >" into outputStream, we may as well cap it off by letting "</excerpt>" in.
               * 5. KMP does this without asking (bizarre: leaving it as an exercise to us if we didn't want it).
               * 6. Add a newline so, upon later reassembly, the first POJO won't start just after the original excerpt element.
               */
              result = match.indexOf( inputStream, ignore, EXCERPT_START_ELEMENT );
              if( result == -1 )
              {
                final String NO_EXCERPT = "There is no opening excerpt element in this big XML";
                getLogger().error( NO_EXCERPT );
                throw new IOException( NO_EXCERPT );
              }
              outputStream.write( "  <excerpt ".getBytes() );
              result = match.indexOf( inputStream, outputStream, EXCERPT_END_ELEMENT );
              if( result == -1 )
              {
                final String NO_EXCERPT = "There is no closing excerpt element in this big XML";
                getLogger().error( NO_EXCERPT );
                throw new IOException( NO_EXCERPT );
              }
              outputStream.write( '\n' );
            }
          } );
        }
      } );

      AtomicReference< List< Pojo > > pojoListHolder = new AtomicReference<>();

      // parse the cloned flowfile into POJOs...
      session.read( pojos, new InputStreamCallback()
      {
        @Override public void process( InputStream inputStream ) throws IOException
        {
          try
          {
            SAXParserFactory factory   = SAXParserFactory.newInstance();
            BigXmlSaxHandler handler   = new BigXmlSaxHandler();
            SAXParser        parser    = factory.newSAXParser();
            XMLReader        xmlReader = parser.getXMLReader();

            parser.parse( inputStream, handler );
            pojoListHolder.set( handler.getPojos() );
          }
          catch( ParserConfigurationException e )
          {
            throw new IOException( "ParserConfigurationException", e );
          }
          catch( SAXException e )
          {
            throw new IOException( "SAXException", e );
          }
        }
      } );

      // write out the POJOs serialized to a new flowfile (again capturing the returned reference)...
      pojos = session.write( pojos, new OutputStreamCallback()
      {
        @Override public void process( OutputStream outputStream )
        {
          XStream      xstream  = new XStream( new StaxDriver() );
          List< Pojo > pojoList = pojoListHolder.get();
          xstream.toXML( pojoList, outputStream );
        }
      } );

      excerpt = session.putAttribute( excerpt, EXCERPT_DOCUMENT, UUID );
      pojos   = session.putAttribute( pojos,   EXCERPT_DOCUMENT, UUID );
      session.transfer( excerpt, EXCERPT );
      session.transfer( pojos, POJOS );
      session.remove( flowfile );
    }
    catch( Exception e )
    {
      session.remove( excerpt );
      session.remove( pojos );
      session.transfer( flowfile, FAILURE );
    }
  }

  public static final Relationship FAILURE  = new Relationship.Builder()
      .name( "failure" )
      .description( "Original flowfile routes here upon failure." )
      .build();

  public static final Relationship EXCERPT  = new Relationship.Builder()
      .name( Constants.EXCERPT_DOCUMENT )
      .description( "A new flowfile containing the excerpt that was embedded in the big XML routes here upon success." )
      .build();
  public static final Relationship POJOS    = new Relationship.Builder()
      .name( Constants.BIGXML_POJOS )
      .description( "A new flowfile containing just the serialized POJOs from the analysis routes here upon success." )
      .build();

  private List< PropertyDescriptor > properties;
  private Set< Relationship >        relationships;

  @Override
  public void init( final ProcessorInitializationContext context )
  {
    List< PropertyDescriptor > properties = new ArrayList<>();
    this.properties = Collections.unmodifiableList( properties );

    Set< Relationship > relationships = new HashSet<>();
    relationships.add( FAILURE );
    relationships.add( EXCERPT );
    relationships.add( POJOS );
    this.relationships = Collections.unmodifiableSet( relationships );
  }

  @Override public Set< Relationship >        getRelationships()                { return relationships; }
  @Override public List< PropertyDescriptor > getSupportedPropertyDescriptors() { return properties; }
}
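
A test for ExtractBigXml can follow the same pattern as MonopolyTest. Below is a minimal sketch; it assumes BigXmlSaxHandler and Pojo are on the test classpath and that the enqueued XML is agreeable to that handler:

package com.windofkeltia.processor;

import java.nio.charset.StandardCharsets;
import java.util.List;

import org.junit.Before;
import org.junit.Test;

import static org.junit.Assert.assertEquals;

import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;

public class ExtractBigXmlTest
{
  private TestRunner runner;

  @Before public void setUp() { runner = TestRunners.newTestRunner( new ExtractBigXml() ); }

  @Test
  public void testExcerptAndPojos()
  {
    final String BIG_XML = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
                         + "<bigxml>\n"
                         + "  <excerpt type=\"text\">This is only a test.</excerpt>\n"
                         + "  <!-- body elements the SAX handler turns into POJOs... -->\n"
                         + "</bigxml>";

    runner.enqueue( BIG_XML.getBytes( StandardCharsets.UTF_8 ) );
    runner.run( 1 );
    runner.assertQueueEmpty();

    // one flowfile should route to each of the two success relationships...
    List< MockFlowFile > excerpts = runner.getFlowFilesForRelationship( ExtractBigXml.EXCERPT );
    List< MockFlowFile > pojos    = runner.getFlowFilesForRelationship( ExtractBigXml.POJOS );
    assertEquals( 1, excerpts.size() );
    assertEquals( 1, pojos.size() );

    // both carry the original flowfile's uuid in the same attribute...
    assertEquals( excerpts.get( 0 ).getAttribute( Constants.EXCERPT_DOCUMENT ),
                  pojos.get( 0 ).getAttribute( Constants.EXCERPT_DOCUMENT ) );
  }
}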