View Javadoc
1   package org.bitbucket.jrsofty.parser.logging.util;
2   
3   import java.util.ArrayList;
4   import java.util.regex.Matcher;
5   import java.util.regex.Pattern;
6   
/**
 * Helper class providing multiple ways to tokenize a String for parsing.
 *
 * <p>
 * The class declares no instance fields. Every tokenizer method treats a {@code null} input value
 * as empty input and returns an empty array instead of throwing.
 * </p>
 *
 * @author jrsofty
 *
 */
public class Tokenizer {

  /** Matches one run of non-white-space characters (captured as group 1). */
  private static final Pattern WHITE_SPACE_TOKEN = Pattern.compile("(\\S+)");

  /**
   * Matches either a double-quoted section (content without the quotes in group 1) or a run of
   * non-white-space characters (group 2).
   */
  private static final Pattern QUOTED_TOKEN = Pattern.compile("\"([^\"]*)\"|(\\S+)");

  /**
   * Splits a given string into tokens based on white space. This ignores quotation marks. So in a
   * line like
   *
   * <pre>
   * The "dirty red" fish was here.
   * </pre>
   *
   * <p>
   * the tokens would be
   * </p>
   * <ul>
   * <li>The</li>
   * <li>"dirty</li>
   * <li>red"</li>
   * <li>fish</li>
   * <li>was</li>
   * <li>here.</li>
   * </ul>
   *
   * @param value
   *          The String to be tokenized. May be {@code null}.
   * @return an array of String containing the tokens of the value; an empty array if the value is
   *         {@code null} or contains no non-white-space characters.
   */
  public String[] simpleWhiteSpaceTokenizer(final String value) {
    if (null == value) {
      return new String[0];
    }
    final ArrayList<String> tokenList = new ArrayList<String>();
    final Matcher regexMatcher = WHITE_SPACE_TOKEN.matcher(value);
    while (regexMatcher.find()) {
      tokenList.add(regexMatcher.group(1));
    }
    return tokenList.toArray(new String[0]);
  }

  /**
   * Splits a given string into tokens based on white space. This function recognizes quotation
   * marks and any white space there contained is kept. So in a line like
   *
   * <pre>
   * The "dirty red" fish was here.
   * </pre>
   * <p>
   * the tokens would be
   * </p>
   * <ul>
   * <li>The</li>
   * <li>dirty red</li>
   * <li>fish</li>
   * <li>was</li>
   * <li>here.</li>
   * </ul>
   *
   * <p>
   * The surrounding quotation marks are not part of the token. An empty pair of quotation marks
   * produces an empty token.
   * </p>
   *
   * @param value
   *          the text line that should be split into tokens. May be {@code null}.
   * @return an array of String containing the tokens of the value; an empty array if the value is
   *         {@code null} or contains no tokens.
   */
  public String[] quotedWhiteSpaceTokenizer(final String value) {
    if (null == value) {
      // Same contract as simpleWhiteSpaceTokenizer: null is treated as empty input.
      return new String[0];
    }
    final ArrayList<String> tokenList = new ArrayList<String>();
    final Matcher regexMatcher = QUOTED_TOKEN.matcher(value);
    while (regexMatcher.find()) {
      // Group 1 is set only when the quoted alternative matched; otherwise group 2 holds the token.
      final String quoted = regexMatcher.group(1);
      tokenList.add(quoted != null ? quoted : regexMatcher.group(2));
    }
    return tokenList.toArray(new String[0]);
  }

  /**
   * Split the value string with a custom regex pattern. This method supports multiple groups in the
   * regex pattern.
   *
   * <p>
   * For every match, each capturing group that participated in the match contributes one token, in
   * group order. Only capturing groups produce tokens: a pattern without any capturing group
   * yields an empty array.
   * </p>
   *
   * @param regexPattern
   *          The RegEx pattern as String
   * @param value
   *          the value to be split by the RegEx pattern. May be {@code null}.
   * @return an array of String containing the tokens of the value; an empty array if the value is
   *         {@code null} or nothing was captured.
   * @throws java.util.regex.PatternSyntaxException
   *           if the pattern cannot be compiled.
   * @throws NullPointerException
   *           if the pattern is {@code null}.
   */
  public String[] tokenizeWithPattern(final String regexPattern, final String value) {
    // Compile before looking at value so that a null or invalid pattern is always reported.
    final Pattern pattern = Pattern.compile(regexPattern);
    if (null == value) {
      return new String[0];
    }
    final ArrayList<String> tokenList = new ArrayList<String>();
    final Matcher regexMatcher = pattern.matcher(value);
    final int groupCount = regexMatcher.groupCount();
    while (regexMatcher.find()) {
      for (int i = 1; i <= groupCount; i++) {
        final String group = regexMatcher.group(i);
        if (group != null) {
          tokenList.add(group);
        }
      }
    }
    return tokenList.toArray(new String[0]);
  }
}