// Tokenizer

package org.bitbucket.jrsofty.parser.logging.util;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Helper class providing multiple ways to tokenize a String for parsing.
 *
 * <p>
 * All tokenizer methods treat a {@code null} input value as "nothing to tokenize" and return an
 * empty array instead of throwing. The class keeps no instance state.
 * </p>
 *
 * @author jrsofty
 *
 */
public class Tokenizer {

  /** Matches one run of non-white-space characters; group 1 is the token. */
  private static final Pattern WHITE_SPACE_TOKEN = Pattern.compile("(\\S+)");

  /**
   * Matches either a double-quoted section (group 1, quotes excluded) or a run of non-white-space
   * characters (group 2).
   */
  private static final Pattern QUOTED_OR_PLAIN_TOKEN = Pattern.compile("\"([^\"]*)\"|(\\S+)");

  /**
   * Splits a given string into tokens based on white space. This ignores quotation marks. So in a
   * line like
   *
   * <pre>
   * The "dirty red" fish was here.
   * </pre>
   *
   * <p>
   * the tokens would be
   * </p>
   * <ul>
   * <li>The</li>
   * <li>"dirty</li>
   * <li>red"</li>
   * <li>fish</li>
   * <li>was</li>
   * <li>here.</li>
   * </ul>
   *
   *
   * @param value
   *          The String to be tokenized; may be {@code null}.
   * @return an array of String containing the tokens of the value, in order of appearance. The
   *         array is empty when the value is {@code null} or contains no non-white-space
   *         characters.
   */
  public String[] simpleWhiteSpaceTokenizer(final String value) {
    if (null == value) {
      return new String[] {};
    }
    final ArrayList<String> tokenList = new ArrayList<String>();
    final Matcher regexMatcher = WHITE_SPACE_TOKEN.matcher(value);
    while (regexMatcher.find()) {
      tokenList.add(regexMatcher.group(1));
    }

    return tokenList.toArray(new String[0]);
  }

  /**
   * Splits a given string into tokens based on white space. This function recognizes quotation
   * marks and any white space there contained is kept. So in a line like
   *
   * <pre>
   * The "dirty red" fish was here.
   * </pre>
   * <p>
   * the tokens would be
   * </p>
   * <ul>
   * <li>The</li>
   * <li>dirty red</li>
   * <li>fish</li>
   * <li>was</li>
   * <li>here.</li>
   * </ul>
   *
   * <p>
   * The surrounding quotation marks are not part of the token, and an empty pair of quotation
   * marks produces an empty token. A quotation mark without a closing partner is not treated as a
   * quote: it stays in the token and splitting falls back to white space. The same applies to a
   * quotation mark that appears in the middle of a white-space-delimited word.
   * </p>
   *
   * @param value
   *          the text line that should be split into tokens; may be {@code null}.
   * @return an array of String containing the tokens of the value, in order of appearance. The
   *         array is empty when the value is {@code null} or contains no tokens.
   */
  public String[] quotedWhiteSpaceTokenizer(final String value) {
    // Same null contract as simpleWhiteSpaceTokenizer: no input means no tokens.
    if (null == value) {
      return new String[] {};
    }
    final ArrayList<String> tokenList = new ArrayList<String>();
    final Matcher regexMatcher = QUOTED_OR_PLAIN_TOKEN.matcher(value);
    while (regexMatcher.find()) {
      // Group 1 is set only when the quoted alternative matched; otherwise group 2 holds the
      // plain token.
      final String quoted = regexMatcher.group(1);
      tokenList.add(quoted != null ? quoted : regexMatcher.group(2));
    }

    return tokenList.toArray(new String[0]);
  }

  /**
   * Split the value string with a custom regex pattern. This method supports multiple groups in the
   * regex pattern.
   *
   * <p>
   * For every match found in the value, each capturing group that took part in the match
   * contributes one token, in group order. Groups that did not take part in a match are skipped.
   * A pattern without capturing groups therefore yields no tokens.
   * </p>
   *
   * @param regexPattern
   *          The RegEx pattern as String
   * @param value
   *          the value to be split by the RegEx pattern; may be {@code null}.
   * @return an array of String containing the tokens of the value. The array is empty when the
   *         value is {@code null} or nothing was captured.
   * @throws java.util.regex.PatternSyntaxException
   *           if the pattern is not a valid regular expression.
   * @throws NullPointerException
   *           if the pattern is {@code null}.
   */
  public String[] tokenizeWithPattern(final String regexPattern, final String value) {
    // Compile first so a broken pattern is reported even when there is nothing to tokenize.
    final Pattern pattern = Pattern.compile(regexPattern);
    if (null == value) {
      return new String[] {};
    }
    final ArrayList<String> tokenList = new ArrayList<String>();
    final Matcher regexMatcher = pattern.matcher(value);
    final int groupCount = regexMatcher.groupCount();
    while (regexMatcher.find()) {
      // Group 0 (the whole match) is intentionally skipped; only capturing groups are tokens.
      for (int i = 1; i <= groupCount; i++) {
        final String captured = regexMatcher.group(i);
        if (captured != null) {
          tokenList.add(captured);
        }
      }
    }

    return tokenList.toArray(new String[0]);
  }
}