View Javadoc
1   package org.bitbucket.jrsofty.parser.logging.util;
2   
3   import java.util.HashMap;
4   import java.util.Stack;
5   
6   import org.bitbucket.jrsofty.parser.logging.api.TokenMatcher;
7   
8   public class LogLineFormatReader {
9   
10    private static final String VALID_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
11    private static final int CMD_LEN = 3;
12    private static final int DATE_FORMAT_LEN_1 = 1;
13    private static final int DATE_FORMAT_LEN_2 = 2;
14    private static final int DATE_FORMAT_LEN_3 = 3;
15    private static final int DATE_FORMAT_LEN_4 = 4;
16  
17    private final HashMap<String, Integer> formatTokenInstanceCounter = new HashMap<String, Integer>();
18  
19    /**
20     * Creates a hash map containing the token matchers for each token in the order in which they
21     * should be expected in the log line.
22     *
23     * @param format
24     *          a String containing the log line format expected.
25     * @return a HashMap&lt;Integer, TokenMatcher&gt; containing the token matching class in the
26     *         expected order.
27     * @throws LogLineFormatException
28     *           when the given format is invalid.
29     */
30    public HashMap<Integer, TokenMatcher> createTokenMatchers(final String format)
31        throws LogLineFormatException {
32      final HashMap<Integer, TokenMatcher> map = new HashMap<Integer, TokenMatcher>();
33      int formatIndex = 0;
34      final String trimmedInput = format.trim() + " ";
35      final StringBuffer regexBuffer = new StringBuffer();
36      final Stack<Character> wrapperStack = new Stack<Character>();
37      final String openWrapper = "[({\"'`";
38      final String closeWrapper = "])}\"'`";
39      String lastCommand = "";
40      for (int i = 0; i < trimmedInput.length();) {
41        final char c = trimmedInput.charAt(i);
42        if (c == '%') {
43          i++;
44          final String cmd = trimmedInput.substring(i, i + LogLineFormatReader.CMD_LEN);
45          i += LogLineFormatReader.CMD_LEN;
46          if (cmd.equals("dtm")) {
47            final StringBuffer dtmFormatBuffer = new StringBuffer();
48            char ca = trimmedInput.charAt(i);
49            if (ca != '{') {
50              throw new LogLineFormatException("Expected '{' token not found at " + i);
51            }
52            i++;
53            while ((ca = trimmedInput.charAt(i)) != '}') {
54              dtmFormatBuffer.append(ca);
55              i++;
56              if (i >= trimmedInput.length()) {
57                throw new LogLineFormatException("Expected '}' token not found by " + i);
58              }
59            }
60            if (dtmFormatBuffer.toString().isEmpty()) {
61              throw new LogLineFormatException(
62                  "Invalid date time formatting. There must be a formatting for date and time provided.");
63            }
64            regexBuffer.append(this.createRegExForDateTime(dtmFormatBuffer.toString()));
65            i++;
66          } else if (cmd.equals("ip4")) {
67            regexBuffer.append(this.getRegExForIp4Address());
68          } else if (cmd.equals("url")) {
69            regexBuffer.append(this.getRegExForUrlMatch());
70          } else if (cmd.equals("str")) {
71            regexBuffer.append(this.getRegExForSimpleString());
72          } else if (cmd.equals("opt")) {
73            char ca = trimmedInput.charAt(i);
74            if (ca != '{') {
75              throw new LogLineFormatException("Expected '{' token not found at " + i);
76            }
77            i++;
78            final StringBuffer wordBuffer = new StringBuffer();
79            final StringBuffer listBuffer = new StringBuffer();
80  
81            while ((ca = trimmedInput.charAt(i)) != '}') {
82  
83              if (ca == ',') {
84  
85                listBuffer.append(wordBuffer.toString());
86                wordBuffer.delete(0, wordBuffer.length());
87                listBuffer.append('|');
88              } else {
89                final String strChar = String.valueOf(ca);
90                if (!LogLineFormatReader.VALID_CHARS.contains(strChar)) {
91                  final String caStr = String.valueOf(ca);
92                  throw new LogLineFormatException("'" + caStr + "' is an invalid character.");
93                }
94                wordBuffer.append(ca);
95              }
96  
97              i++;
98  
99            }
100           // Wraps up the last text elements.
101           listBuffer.append(wordBuffer.toString());
102 
103           regexBuffer.append('(' + listBuffer.toString() + ')');
104           i++;
105         } else if (cmd.equals("msg")) {
106           regexBuffer.append(this.getRegExForMessage());
107         } else if (cmd.equals("int")) {
108           regexBuffer.append(this.getRegExForIntegerMatch());
109 
110         } else {
111           throw new LogLineFormatException(
112               "The command " + cmd + " is an unsupported format token");
113         }
114         lastCommand = cmd;
115       } else if (c == ' ') {
116         map.put(formatIndex, new TokenMatcher(regexBuffer.toString(),
117             '%' + this.createMappingInstanceNumbering(lastCommand)));
118         formatIndex++;
119         regexBuffer.delete(0, regexBuffer.length());
120         i++;
121       } else {
122         final String test = String.valueOf(c);
123 
124         if (openWrapper.contains(test)) {
125           wrapperStack.push(c);
126         } else if (closeWrapper.contains(test)) {
127           final int ndx = closeWrapper.indexOf(test);
128           final char closeC = openWrapper.charAt(ndx);
129           final Character ca = wrapperStack.pop();
130           if (ca.charValue() != closeC) {
131             throw new LogLineFormatException(
132                 "Unclosed wrapper. Expecting " + closeC + " found " + ca.charValue());
133           } else {
134             // do nothing.
135           }
136         }
137         regexBuffer.append(this.escapeCharacter(c));
138         i++;
139       }
140     }
141 
142     return map;
143 
144   }
145 
146   private String createMappingInstanceNumbering(final String commandToken) {
147     String result = null;
148     if (this.formatTokenInstanceCounter.containsKey(commandToken)) {
149       Integer value = this.formatTokenInstanceCounter.get(commandToken);
150       value = value + 1;
151       result = commandToken + "[" + value + "]";
152       this.formatTokenInstanceCounter.put(commandToken, value);
153     } else {
154       this.formatTokenInstanceCounter.put(commandToken, 0);
155       result = commandToken + "[0]";
156     }
157     return result;
158   }
159 
160   private String createRegExForDateTime(final String formatInfo) throws LogLineFormatException {
161 
162     final String paddedInfo = formatInfo + " ";
163     final StringBuffer buffer = new StringBuffer("(");
164     boolean hasDateElement = false;
165     char lastChar = ' ';
166     int count = 0;
167     for (int i = 0; i < paddedInfo.length(); i++) {
168       final char c = paddedInfo.charAt(i);
169       if (!String.valueOf(c).matches("[yuYdwhHmsZM]")) {
170         if (count > 0) {
171           buffer.append(this.getRegExForDateTimeElement(lastChar, count));
172           hasDateElement = true;
173           count = 0;
174         }
175 
176         buffer.append(this.escapeCharacter(c));
177         lastChar = c;
178       } else {
179 
180         if ((count > 0) && (c != lastChar)) {
181           buffer.append(this.getRegExForDateTimeElement(lastChar, count));
182           hasDateElement = true;
183           count = 0;
184         }
185 
186         lastChar = c;
187         count++;
188       }
189 
190     }
191     if (!hasDateElement) {
192       throw new LogLineFormatException("The date format did not contain any date elements.");
193     }
194 
195     final String output = buffer.toString().trim() + ")?";
196     return output;
197   }
198 
199   private String escapeCharacter(final char c) {
200     final String reservedChars = "^!=$?*+\\[].(){}|";
201     final String testChar = String.valueOf(c);
202 
203     if (reservedChars.contains(testChar)) {
204       return "\\" + testChar;
205     }
206     return testChar;
207 
208   }
209 
210   private String getRegExForIntegerMatch() {
211     return "([\\d]+)?";
212   }
213 
214   private String getRegExForUrlMatch() {
215     return "(((http[s]*://)([\\S]+))|([-]))?";
216   }
217 
218   private String getRegExForSimpleString() {
219     return "([\\S]+)?";
220   }
221 
222   private String getRegExForMessage() {
223     return "([\\s\\S]+)?";
224   }
225 
226   private String getRegExForIp4Address() {
227     return "((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))?";
228   }
229 
230   private String getRegExForDateTimeElement(final char c, final int count)
231       throws LogLineFormatException {
232     String output = null;
233     if ((c == 'y') || (c == 'u') || (c == 'Y')) {
234       if ((count != LogLineFormatReader.DATE_FORMAT_LEN_2)
235           && (count != LogLineFormatReader.DATE_FORMAT_LEN_4)) {
236         throw new LogLineFormatException("Year formats must have two or four characters.");
237       }
238       output = "([0-9]{" + count + "})";
239     } else if ((c == 'd')) {
240       if (count == LogLineFormatReader.DATE_FORMAT_LEN_1) {
241         output = "([0-9]{1,2})";
242       } else if (count == 3) {
243         output = "([0-9]{3})";
244       } else if (count == 2) {
245         output = "([0-9]{2})";
246       } else {
247         throw new LogLineFormatException("Day formatting is either 'd' or 'dd' or 'ddd'.");
248       }
249     } else if ((c == 'w') || (c == 'h') || (c == 'H') || (c == 'm') || (c == 's')) {
250       if ((count != LogLineFormatReader.DATE_FORMAT_LEN_2)) {
251         throw new LogLineFormatException(
252             "Week, hour, minute, second elements require 2 characters");
253       }
254       output = "([0-9]{2})";
255     } else if (c == 'Z') {
256       if (count != LogLineFormatReader.DATE_FORMAT_LEN_1) {
257         throw new LogLineFormatException("No more than one time zone character");
258       }
259       output = "([0-9\\+-]{5}|[0-9\\+-]{3}:[0-9]{2})";
260     } else {
261       if (count == LogLineFormatReader.DATE_FORMAT_LEN_1) {
262         output = "([0-9]{1,2})";
263       } else if (count == LogLineFormatReader.DATE_FORMAT_LEN_2) {
264         output = "([0-9]{2})";
265       } else if (count == LogLineFormatReader.DATE_FORMAT_LEN_3) {
266         output = "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)";
267       } else {
268         output = "(January|February|March|April|May|June|July|August|September|October|November|December)";
269       }
270     }
271 
272     return output;
273   }
274 
275 }