1
/*
2
 * Copyright (c) 2002-2007  Dustin Sallings <dustin@spy.net>
3
 */
4
5
#include <iostream>
6
7
#include <stdio.h>
8
#include <time.h>
9
#include <ctype.h>
10
#include <sys/time.h>
11
#include <sys/types.h>
12
13
#ifdef USE_ASSERT
14
# include <assert.h>
15
#else
16
# undef assert
17
# define assert(a)
18
#endif
19
20
#include <boost/regex.hpp>
21
22
#include <zlib.h>
23
24
#include "logfiles.h"
25
26
#define NOTREACHED 0
27
28
#define AMAZON_S3_REGEX "^[0-9a-f]+ ([-A-z0-9_\\.]+) \\[(.*)\\] ([0-9\\.]+) " \
29
	"[0-9a-f]+ [0-9A-F]+ \\S+ \\S+ (\"[^\"]*\") (\\d+) [-A-z0-9]+ ([-0-9]+) " \
30
	"[-0-9]+ \\d+ [-0-9]+ (\"[^\"]*\") (\"[^\"]*\")"
31
32
boost::regex amazon_s3_regex(AMAZON_S3_REGEX, boost::regex::perl);
33
34
static bool myGzgets(struct logfile *lf)
35
{
36
	char *rv=lf->line;
37
	int s=LINE_BUFFER;
38
	int bytesRead=0;
39
40
	lf->lineLength=0;
41
42
	for(;;) {
43
		if(lf->gzBufCur > lf->gzBufEnd || lf->gzBufEnd == NULL) {
44
			/* Fetch some more stuff */
45
			bytesRead=gzread(lf->input, lf->gzBuf, GZBUFFER);
46
			lf->gzBufEnd=bytesRead + lf->gzBuf - 1;
47
			/* Make sure we got something */
48
			if(bytesRead == 0) {
49
				return(false);
50
			}
51
			lf->gzBufCur=lf->gzBuf;
52
		}
53
		/* Make sure we do not get too many characters */
54
		if(--s > 0) {
55
			*rv++ = *lf->gzBufCur;
56
			lf->lineLength++;
57
			if(*(lf->gzBufCur++) == '\n') {
58
				*rv=0x00;
59
				return(true);
60
			}
61
		} else {
62
			*rv=0x00;
63
			return(true);
64
		}
65
	}
66
67
	assert(NOTREACHED);
68
	return(rv);
69
}
70
71
/* Returns a value from logTypes */
72
static enum logType identifyLog(const char *line) {
73
	enum logType rv=UNKNOWN;
74
	assert(line != NULL);
75
76
	if(boost::regex_search(line, amazon_s3_regex)) {
77
		rv=AMAZON_S3;
78
	} else {
79
		rv=COMMON;
80
	}
81
	return rv;
82
}
83
84
static void outputLineS3(struct logfile *lf) {
85
	boost::cmatch what;
86
87
	assert(lf);
88
	assert(lf->line);
89
90
/*
91
// Positions as defined in the regex
92
S3_BUCKET	1
93
S3_DATE		2
94
S3_IP		3
95
S3_REQ		4
96
S3_STATUS	5
97
S3_SIZE		6
98
S3_REFER	7
99
S3_UA		8
100
*/
101
102
	if(boost::regex_search(lf->line, what, amazon_s3_regex)) {
103
		std::ostream_iterator<char> out(std::cout);
104
		what.format(out, "$3 - - [$2] $4 $5 $6 $7 $8 $1\n");
105
	} else {
106
		fprintf(stderr, "*** S3: Failed to match ``%s''\n", lf->line);
107
	}
108
}
109
110
static void outputLineDirect(struct logfile *lf) {
111
	assert(lf != NULL);
112
	assert(lf->line != NULL);
113
	fwrite(lf->line, lf->lineLength, 1, stdout);
114
}
115
116
/**
117
 * Open a logfile.
118
 */
119
int openLogfile(struct logfile *lf)
120
{
121
	int rv=ERROR;
122
	assert(lf != NULL);
123
124
	assert(! lf->isOpen);
125
126
	fprintf(stderr, "*** Opening %s\n", lf->filename);
127
128
	lf->input=gzopen(lf->filename, "r");
129
130
	if(lf->input != NULL) {
131
		lf->isOpen=true;
132
		rv=OK;
133
	}
134
135
	/* Allocate the line buffer */
136
	lf->line=(char*)calloc(1, LINE_BUFFER);
137
	assert(lf->line != NULL);
138
	lf->lineLength=0;
139
140
	/* Allocate the read buffer */
141
	lf->gzBuf=(char*)calloc(1, GZBUFFER);
142
	assert(lf->gzBuf != NULL);
143
144
	lf->gzBufCur=NULL;
145
	lf->gzBufEnd=NULL;
146
147
	return(rv);
148
}
149
150
/* A date string paired with an integer value. */
struct date_str {
	char *datestr;	/* the string */
	int val;	/* the value that goes with it */
};
155
156
/*
 * Each month is keyed by the four bytes "Mon/" packed into one integer,
 * first byte most significant, so a month can be recognised with a single
 * switch.
 */
#define MONTH_JAN       (((((('J'<<8)|'a')<<8)|'n')<<8)|'/')
#define MONTH_FEB       (((((('F'<<8)|'e')<<8)|'b')<<8)|'/')
#define MONTH_MAR       (((((('M'<<8)|'a')<<8)|'r')<<8)|'/')
#define MONTH_APR       (((((('A'<<8)|'p')<<8)|'r')<<8)|'/')
#define MONTH_MAY       (((((('M'<<8)|'a')<<8)|'y')<<8)|'/')
#define MONTH_JUN       (((((('J'<<8)|'u')<<8)|'n')<<8)|'/')
#define MONTH_JUL       (((((('J'<<8)|'u')<<8)|'l')<<8)|'/')
#define MONTH_AUG       (((((('A'<<8)|'u')<<8)|'g')<<8)|'/')
#define MONTH_SEP       (((((('S'<<8)|'e')<<8)|'p')<<8)|'/')
#define MONTH_OCT       (((((('O'<<8)|'c')<<8)|'t')<<8)|'/')
#define MONTH_NOV       (((((('N'<<8)|'o')<<8)|'v')<<8)|'/')
#define MONTH_DEC       (((((('D'<<8)|'e')<<8)|'c')<<8)|'/')

/*
 * Convert a three character month followed by '/' to its numeric value.
 *
 * input: text beginning with e.g. "Jan/".  Only the first four bytes are
 *        examined, fewer if the string ends sooner.
 *
 * Returns 0 (Jan) through 11 (Dec), or -1 if input is NULL or does not
 * begin with a recognised month.
 */
TESTED_STATIC int parseMonth(const char *input) {
	int rv=-1;
	unsigned int key=0;

	if(input == NULL) {
		return rv;
	}

	/*
	 * Pack through unsigned char into an unsigned accumulator: a plain
	 * char may be signed, and a byte >= 0x80 would otherwise sign-extend
	 * and turn the next shift into a shift of a negative value.
	 */
	for(int i=0; i<4 && input[i] != '\0'; i++) {
		key = (key << 8) | (unsigned char)input[i];
	}

	switch(key) {
		case MONTH_JAN: rv=0; break;
		case MONTH_FEB: rv=1; break;
		case MONTH_MAR: rv=2; break;
		case MONTH_APR: rv=3; break;
		case MONTH_MAY: rv=4; break;
		case MONTH_JUN: rv=5; break;
		case MONTH_JUL: rv=6; break;
		case MONTH_AUG: rv=7; break;
		case MONTH_SEP: rv=8; break;
		case MONTH_OCT: rv=9; break;
		case MONTH_NOV: rv=10; break;
		case MONTH_DEC: rv=11; break;
		default: break;
	}

	return rv;
}
195
196
/* Thrown while parsing a timestamp to abandon a malformed log line. */
class BadTimestamp : public std::exception {
public:
	/*
	 * Fixed description of the failure.  Declared public so it can be
	 * called on a BadTimestamp directly, not only through a
	 * std::exception reference.
	 */
	virtual const char* what() const throw() {
		return "Timestamp parse error";
	}
};
201
202
static time_t parseTimestamp(struct logfile *lf)
203
{
204
	char *p;
205
206
	assert(lf != NULL);
207
	assert(lf->line != NULL);
208
209
	lf->timestamp=-1;
210
211
	p=lf->line;
212
213
	try {
214
215
		/* The shortest line I can parse is about 32 characters. */
216
		if(lf->lineLength < 32) {
217
			/* This is a broken entry */
218
			fprintf(stderr, "Broken log entry (too short):  %s\n", p);
219
		} else if(index(p, '[') != NULL) {
220
			struct tm tm;
221
			memset(&tm, 0x00, sizeof(tm));
222
223
			p=index(p, '[');
224
			/* Input validation */
225
			if(p == NULL || lf->lineLength < 32) {
226
				fprintf(stderr, "invalid log line:  %s\n", lf->line);
227
				throw BadTimestamp();
228
			}
229
230
			/* fprintf(stderr, "**** Parsing %s\n", p); */
231
			p++;
232
			tm.tm_mday=atoi(p);
233
			p+=3;
234
			tm.tm_mon=parseMonth(p);
235
			p+=4;
236
			tm.tm_year=atoi(p);
237
			p+=5;
238
			tm.tm_hour=atoi(p);
239
			p+=3;
240
			tm.tm_min=atoi(p);
241
			p+=3;
242
			tm.tm_sec=atoi(p);
243
244
			/* Make sure it still looks like CLF */
245
			if(p[2] != ' ') {
246
				fprintf(stderr,
247
					"log line is starting to not look like CLF: %s\n",
248
					lf->line);
249
				throw BadTimestamp();
250
			}
251
252
			tm.tm_year-=1900;
253
254
			/* Let mktime guess the timezone */
255
			tm.tm_isdst=-1;
256
257
			lf->timestamp=mktime(&tm);
258
259
		} else {
260
			fprintf(stderr, "Unknown log format:  %s\n", p);
261
		}
262
263
	} catch(BadTimestamp e) {
264
		// Damn.
265
	}
266
267
	if(lf->timestamp < 0) {
268
		fprintf(stderr, "* Error parsing timestamp from %s", lf->line);
269
	}
270
271
	return(lf->timestamp);
272
}
273
274
/**
275
 * Get the next line from a log file.
276
 * Return whether the seek actually occurred.
277
 */
278
static bool nextLine(struct logfile *lf)
279
{
280
	bool rv=false;
281
282
	assert(lf != NULL);
283
284
	if(!lf->isOpen) {
285
		int logfileOpened=openLogfile(lf);
286
		/* This looks a little awkward, but it's the only way I can both
287
		 * avoid the side effect of having assert perform the task and
288
		 * not leave the variable unreferenced when assertions are off.
289
		 */
290
		if(logfileOpened != OK) {
291
			assert(logfileOpened == OK);
292
		}
293
		/* Recurse to skip a line */
294
		rv=nextLine(lf);
295
		assert(rv);
296
	}
297
298
	if(myGzgets(lf)) {
299
		rv=true;
300
		char *p=lf->line;
301
		/* Make sure the line is short enough */
302
		assert(lf->lineLength < LINE_BUFFER);
303
		/* Make sure we read a line */
304
		if(p[lf->lineLength-1] != '\n') {
305
			fprintf(stderr, "*** BROKEN LOG ENTRY IN %s (no newline)\n",
306
				lf->filename);
307
			rv=false;
308
		} else if(parseTimestamp(lf) == -1) {
309
			/* If we can't parse the timestamp, give up */
310
			rv=false;
311
		}
312
	}
313
314
	return rv;
315
}
316
317
static void closeLogfile(struct logfile *lf)
318
{
319
	int gzerrno=0;
320
321
	assert(lf != NULL);
322
	assert(lf->input != NULL);
323
	assert(lf->filename != NULL);
324
325
	fprintf(stderr, "*** Closing %s\n", lf->filename);
326
327
	/* Free the line buffer */
328
	if(lf->line != NULL) {
329
		free(lf->line);
330
		lf->line=NULL;
331
	}
332
333
	gzerrno=gzclose(lf->input);
334
	if(gzerrno!=0) {
335
		gzerror(lf->input, &gzerrno);
336
	}
337
	lf->isOpen=false;
338
339
	if(lf->gzBuf != NULL) {
340
		free(lf->gzBuf);
341
		lf->gzBuf = NULL;
342
	}
343
344
	lf->gzBufCur=NULL;
345
	lf->gzBufEnd=NULL;
346
}
347
348
/**
349
 * Get rid of a logfile that's no longer needed.
350
 */
351
static void destroyLogfile(struct logfile *lf)
352
{
353
	assert(lf != NULL);
354
355
	fprintf(stderr, "** Destroying %s\n", lf->filename);
356
357
	if(lf->isOpen) {
358
		closeLogfile(lf);
359
	}
360
361
	/* Free the parts */
362
	if(lf->filename!=NULL) {
363
		free(lf->filename);
364
	}
365
	if(lf->line != NULL) {
366
		free(lf->line);
367
	}
368
	if(lf->gzBuf != NULL) {
369
		free(lf->gzBuf);
370
	}
371
372
	/* Lastly, free the container itself. */
373
	free(lf);
374
}
375
376
/**
377
 * Create a new logfile.
378
 */
379
struct logfile *createLogfile(const char *filename)
380
{
381
	struct logfile *rv=NULL;
382
383
	rv=(struct logfile *)calloc(1, sizeof(struct logfile));
384
	assert(rv != NULL);
385
386
	rv->filename=(char *)strdup(filename);
387
	assert(rv->filename != NULL);
388
389
	/* Try to open the logfile */
390
	if(openLogfile(rv) != OK) {
391
		destroyLogfile(rv);
392
		rv=NULL;
393
	} else {
394
		/* If it's opened succesfully, read the next (first) line */
395
		if(!nextLine(rv)) {
396
			/* If nextLine didn't return a record, this entry is invalid. */
397
			destroyLogfile(rv);
398
			rv=NULL;
399
		} else {
400
			/* Otherwise, it's valid and we'll proceed, but close it. */
401
			switch(identifyLog(rv->line)) {
402
				case COMMON:
403
					fprintf(stderr, "**** %s is a common log file\n", filename);
404
					rv->outputLine=outputLineDirect;
405
					break;
406
				case AMAZON_S3:
407
					fprintf(stderr, "**** %s is an s3 log file\n", filename);
408
					rv->outputLine=outputLineS3;
409
					break;
410
				case UNKNOWN:
411
					fprintf(stderr, "! Can't identify type of %s\n", filename);
412
					break;
413
				default:
414
					assert(false);
415
			}
416
417
			if(rv->outputLine == NULL) {
418
				destroyLogfile(rv);
419
				rv=NULL;
420
			} else {
421
				closeLogfile(rv);
422
			}
423
		}
424
	}
425
426
	return(rv);
427
}
428
429
/**
430
 * Get rid of the first entry in the log list, and reinsert it somewhere
431
 * that makes sense, or throw it away if it's no longer necessary.
432
 */
433
void skipRecord(log_queue& queue)
434
{
435
	struct logfile *oldEntry=NULL;
436
	assert(!queue.empty());
437
438
	oldEntry=queue.top();
439
	queue.pop();
440
441
	/* If stuff comes back, reinsert the old entry */
442
	if(nextLine(oldEntry)) {
443
		queue.push(oldEntry);
444
	} else {
445
		destroyLogfile(oldEntry);
446
	}
447
}