1 package org.bitbucket.jrsofty.parser.logging.util;
2
3 import java.util.HashMap;
4 import java.util.Stack;
5
6 import org.bitbucket.jrsofty.parser.logging.api.TokenMatcher;
7
8 public class LogLineFormatReader {
9
10 private static final String VALID_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
11 private static final int CMD_LEN = 3;
12 private static final int DATE_FORMAT_LEN_1 = 1;
13 private static final int DATE_FORMAT_LEN_2 = 2;
14 private static final int DATE_FORMAT_LEN_3 = 3;
15 private static final int DATE_FORMAT_LEN_4 = 4;
16
17 private final HashMap<String, Integer> formatTokenInstanceCounter = new HashMap<String, Integer>();
18
19
20
21
22
23
24
25
26
27
28
29
30 public HashMap<Integer, TokenMatcher> createTokenMatchers(final String format)
31 throws LogLineFormatException {
32 final HashMap<Integer, TokenMatcher> map = new HashMap<Integer, TokenMatcher>();
33 int formatIndex = 0;
34 final String trimmedInput = format.trim() + " ";
35 final StringBuffer regexBuffer = new StringBuffer();
36 final Stack<Character> wrapperStack = new Stack<Character>();
37 final String openWrapper = "[({\"'`";
38 final String closeWrapper = "])}\"'`";
39 String lastCommand = "";
40 for (int i = 0; i < trimmedInput.length();) {
41 final char c = trimmedInput.charAt(i);
42 if (c == '%') {
43 i++;
44 final String cmd = trimmedInput.substring(i, i + LogLineFormatReader.CMD_LEN);
45 i += LogLineFormatReader.CMD_LEN;
46 if (cmd.equals("dtm")) {
47 final StringBuffer dtmFormatBuffer = new StringBuffer();
48 char ca = trimmedInput.charAt(i);
49 if (ca != '{') {
50 throw new LogLineFormatException("Expected '{' token not found at " + i);
51 }
52 i++;
53 while ((ca = trimmedInput.charAt(i)) != '}') {
54 dtmFormatBuffer.append(ca);
55 i++;
56 if (i >= trimmedInput.length()) {
57 throw new LogLineFormatException("Expected '}' token not found by " + i);
58 }
59 }
60 if (dtmFormatBuffer.toString().isEmpty()) {
61 throw new LogLineFormatException(
62 "Invalid date time formatting. There must be a formatting for date and time provided.");
63 }
64 regexBuffer.append(this.createRegExForDateTime(dtmFormatBuffer.toString()));
65 i++;
66 } else if (cmd.equals("ip4")) {
67 regexBuffer.append(this.getRegExForIp4Address());
68 } else if (cmd.equals("url")) {
69 regexBuffer.append(this.getRegExForUrlMatch());
70 } else if (cmd.equals("str")) {
71 regexBuffer.append(this.getRegExForSimpleString());
72 } else if (cmd.equals("opt")) {
73 char ca = trimmedInput.charAt(i);
74 if (ca != '{') {
75 throw new LogLineFormatException("Expected '{' token not found at " + i);
76 }
77 i++;
78 final StringBuffer wordBuffer = new StringBuffer();
79 final StringBuffer listBuffer = new StringBuffer();
80
81 while ((ca = trimmedInput.charAt(i)) != '}') {
82
83 if (ca == ',') {
84
85 listBuffer.append(wordBuffer.toString());
86 wordBuffer.delete(0, wordBuffer.length());
87 listBuffer.append('|');
88 } else {
89 final String strChar = String.valueOf(ca);
90 if (!LogLineFormatReader.VALID_CHARS.contains(strChar)) {
91 final String caStr = String.valueOf(ca);
92 throw new LogLineFormatException("'" + caStr + "' is an invalid character.");
93 }
94 wordBuffer.append(ca);
95 }
96
97 i++;
98
99 }
100
101 listBuffer.append(wordBuffer.toString());
102
103 regexBuffer.append('(' + listBuffer.toString() + ')');
104 i++;
105 } else if (cmd.equals("msg")) {
106 regexBuffer.append(this.getRegExForMessage());
107 } else if (cmd.equals("int")) {
108 regexBuffer.append(this.getRegExForIntegerMatch());
109
110 } else {
111 throw new LogLineFormatException(
112 "The command " + cmd + " is an unsupported format token");
113 }
114 lastCommand = cmd;
115 } else if (c == ' ') {
116 map.put(formatIndex, new TokenMatcher(regexBuffer.toString(),
117 '%' + this.createMappingInstanceNumbering(lastCommand)));
118 formatIndex++;
119 regexBuffer.delete(0, regexBuffer.length());
120 i++;
121 } else {
122 final String test = String.valueOf(c);
123
124 if (openWrapper.contains(test)) {
125 wrapperStack.push(c);
126 } else if (closeWrapper.contains(test)) {
127 final int ndx = closeWrapper.indexOf(test);
128 final char closeC = openWrapper.charAt(ndx);
129 final Character ca = wrapperStack.pop();
130 if (ca.charValue() != closeC) {
131 throw new LogLineFormatException(
132 "Unclosed wrapper. Expecting " + closeC + " found " + ca.charValue());
133 } else {
134
135 }
136 }
137 regexBuffer.append(this.escapeCharacter(c));
138 i++;
139 }
140 }
141
142 return map;
143
144 }
145
146 private String createMappingInstanceNumbering(final String commandToken) {
147 String result = null;
148 if (this.formatTokenInstanceCounter.containsKey(commandToken)) {
149 Integer value = this.formatTokenInstanceCounter.get(commandToken);
150 value = value + 1;
151 result = commandToken + "[" + value + "]";
152 this.formatTokenInstanceCounter.put(commandToken, value);
153 } else {
154 this.formatTokenInstanceCounter.put(commandToken, 0);
155 result = commandToken + "[0]";
156 }
157 return result;
158 }
159
160 private String createRegExForDateTime(final String formatInfo) throws LogLineFormatException {
161
162 final String paddedInfo = formatInfo + " ";
163 final StringBuffer buffer = new StringBuffer("(");
164 boolean hasDateElement = false;
165 char lastChar = ' ';
166 int count = 0;
167 for (int i = 0; i < paddedInfo.length(); i++) {
168 final char c = paddedInfo.charAt(i);
169 if (!String.valueOf(c).matches("[yuYdwhHmsZM]")) {
170 if (count > 0) {
171 buffer.append(this.getRegExForDateTimeElement(lastChar, count));
172 hasDateElement = true;
173 count = 0;
174 }
175
176 buffer.append(this.escapeCharacter(c));
177 lastChar = c;
178 } else {
179
180 if ((count > 0) && (c != lastChar)) {
181 buffer.append(this.getRegExForDateTimeElement(lastChar, count));
182 hasDateElement = true;
183 count = 0;
184 }
185
186 lastChar = c;
187 count++;
188 }
189
190 }
191 if (!hasDateElement) {
192 throw new LogLineFormatException("The date format did not contain any date elements.");
193 }
194
195 final String output = buffer.toString().trim() + ")?";
196 return output;
197 }
198
199 private String escapeCharacter(final char c) {
200 final String reservedChars = "^!=$?*+\\[].(){}|";
201 final String testChar = String.valueOf(c);
202
203 if (reservedChars.contains(testChar)) {
204 return "\\" + testChar;
205 }
206 return testChar;
207
208 }
209
210 private String getRegExForIntegerMatch() {
211 return "([\\d]+)?";
212 }
213
214 private String getRegExForUrlMatch() {
215 return "(((http[s]*://)([\\S]+))|([-]))?";
216 }
217
218 private String getRegExForSimpleString() {
219 return "([\\S]+)?";
220 }
221
222 private String getRegExForMessage() {
223 return "([\\s\\S]+)?";
224 }
225
226 private String getRegExForIp4Address() {
227 return "((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))?";
228 }
229
230 private String getRegExForDateTimeElement(final char c, final int count)
231 throws LogLineFormatException {
232 String output = null;
233 if ((c == 'y') || (c == 'u') || (c == 'Y')) {
234 if ((count != LogLineFormatReader.DATE_FORMAT_LEN_2)
235 && (count != LogLineFormatReader.DATE_FORMAT_LEN_4)) {
236 throw new LogLineFormatException("Year formats must have two or four characters.");
237 }
238 output = "([0-9]{" + count + "})";
239 } else if ((c == 'd')) {
240 if (count == LogLineFormatReader.DATE_FORMAT_LEN_1) {
241 output = "([0-9]{1,2})";
242 } else if (count == 3) {
243 output = "([0-9]{3})";
244 } else if (count == 2) {
245 output = "([0-9]{2})";
246 } else {
247 throw new LogLineFormatException("Day formatting is either 'd' or 'dd' or 'ddd'.");
248 }
249 } else if ((c == 'w') || (c == 'h') || (c == 'H') || (c == 'm') || (c == 's')) {
250 if ((count != LogLineFormatReader.DATE_FORMAT_LEN_2)) {
251 throw new LogLineFormatException(
252 "Week, hour, minute, second elements require 2 characters");
253 }
254 output = "([0-9]{2})";
255 } else if (c == 'Z') {
256 if (count != LogLineFormatReader.DATE_FORMAT_LEN_1) {
257 throw new LogLineFormatException("No more than one time zone character");
258 }
259 output = "([0-9\\+-]{5}|[0-9\\+-]{3}:[0-9]{2})";
260 } else {
261 if (count == LogLineFormatReader.DATE_FORMAT_LEN_1) {
262 output = "([0-9]{1,2})";
263 } else if (count == LogLineFormatReader.DATE_FORMAT_LEN_2) {
264 output = "([0-9]{2})";
265 } else if (count == LogLineFormatReader.DATE_FORMAT_LEN_3) {
266 output = "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)";
267 } else {
268 output = "(January|February|March|April|May|June|July|August|September|October|November|December)";
269 }
270 }
271
272 return output;
273 }
274
275 }