| 1 |
/* |
| 2 |
* Copyright (c) 2002-2007 Dustin Sallings <dustin@spy.net> |
| 3 |
*/ |
| 4 |
|
| 5 |
#include <iostream> |
| 6 |
|
| 7 |
#include <stdio.h> |
| 8 |
#include <time.h> |
| 9 |
#include <ctype.h> |
| 10 |
#include <sys/time.h> |
| 11 |
#include <sys/types.h> |
| 12 |
|
| 13 |
#ifdef USE_ASSERT |
| 14 |
# include <assert.h> |
| 15 |
#else |
| 16 |
# undef assert |
| 17 |
# define assert(a) |
| 18 |
#endif |
| 19 |
|
| 20 |
#include <boost/regex.hpp> |
| 21 |
|
| 22 |
#include <zlib.h> |
| 23 |
|
| 24 |
#include "logfiles.h" |
| 25 |
|
| 26 |
#define NOTREACHED 0 |
| 27 |
|
| 28 |
#define AMAZON_S3_REGEX "^[0-9a-f]+ ([-A-z0-9_\\.]+) \\[(.*)\\] ([0-9\\.]+) " \ |
| 29 |
"[0-9a-f]+ [0-9A-F]+ \\S+ \\S+ (\"[^\"]*\") (\\d+) [-A-z0-9]+ ([-0-9]+) " \ |
| 30 |
"[-0-9]+ \\d+ [-0-9]+ (\"[^\"]*\") (\"[^\"]*\")" |
| 31 |
|
| 32 |
boost::regex amazon_s3_regex(AMAZON_S3_REGEX, boost::regex::perl); |
| 33 |
|
| 34 |
static bool myGzgets(struct logfile *lf) |
| 35 |
{ |
| 36 |
char *rv=lf->line; |
| 37 |
int s=LINE_BUFFER; |
| 38 |
int bytesRead=0; |
| 39 |
|
| 40 |
lf->lineLength=0; |
| 41 |
|
| 42 |
for(;;) { |
| 43 |
if(lf->gzBufCur > lf->gzBufEnd || lf->gzBufEnd == NULL) { |
| 44 |
/* Fetch some more stuff */ |
| 45 |
bytesRead=gzread(lf->input, lf->gzBuf, GZBUFFER); |
| 46 |
lf->gzBufEnd=bytesRead + lf->gzBuf - 1; |
| 47 |
/* Make sure we got something */ |
| 48 |
if(bytesRead == 0) { |
| 49 |
return(false); |
| 50 |
} |
| 51 |
lf->gzBufCur=lf->gzBuf; |
| 52 |
} |
| 53 |
/* Make sure we do not get too many characters */ |
| 54 |
if(--s > 0) { |
| 55 |
*rv++ = *lf->gzBufCur; |
| 56 |
lf->lineLength++; |
| 57 |
if(*(lf->gzBufCur++) == '\n') { |
| 58 |
*rv=0x00; |
| 59 |
return(true); |
| 60 |
} |
| 61 |
} else { |
| 62 |
*rv=0x00; |
| 63 |
return(true); |
| 64 |
} |
| 65 |
} |
| 66 |
|
| 67 |
assert(NOTREACHED); |
| 68 |
return(rv); |
| 69 |
} |
| 70 |
|
| 71 |
/* Returns a value from logTypes */ |
| 72 |
static enum logType identifyLog(const char *line) { |
| 73 |
enum logType rv=UNKNOWN; |
| 74 |
assert(line != NULL); |
| 75 |
|
| 76 |
if(boost::regex_search(line, amazon_s3_regex)) { |
| 77 |
rv=AMAZON_S3; |
| 78 |
} else { |
| 79 |
rv=COMMON; |
| 80 |
} |
| 81 |
return rv; |
| 82 |
} |
| 83 |
|
| 84 |
static void outputLineS3(struct logfile *lf) { |
| 85 |
boost::cmatch what; |
| 86 |
|
| 87 |
assert(lf); |
| 88 |
assert(lf->line); |
| 89 |
|
| 90 |
/* |
| 91 |
// Positions as defined in the regex |
| 92 |
S3_BUCKET 1 |
| 93 |
S3_DATE 2 |
| 94 |
S3_IP 3 |
| 95 |
S3_REQ 4 |
| 96 |
S3_STATUS 5 |
| 97 |
S3_SIZE 6 |
| 98 |
S3_REFER 7 |
| 99 |
S3_UA 8 |
| 100 |
*/ |
| 101 |
|
| 102 |
if(boost::regex_search(lf->line, what, amazon_s3_regex)) { |
| 103 |
std::ostream_iterator<char> out(std::cout); |
| 104 |
what.format(out, "$3 - - [$2] $4 $5 $6 $7 $8 $1\n"); |
| 105 |
} else { |
| 106 |
fprintf(stderr, "*** S3: Failed to match ``%s''\n", lf->line); |
| 107 |
} |
| 108 |
} |
| 109 |
|
| 110 |
/*
 * Copy the current line of lf to standard output unchanged: lineLength
 * bytes of lf->line are written as a single item.
 */
static void outputLineDirect(struct logfile *lf) {
    assert(lf != NULL);
    assert(lf->line != NULL);

    const char *text=lf->line;
    fwrite(text, lf->lineLength, 1, stdout);
}
| 115 |
|
| 116 |
/** |
| 117 |
* Open a logfile. |
| 118 |
*/ |
| 119 |
int openLogfile(struct logfile *lf) |
| 120 |
{ |
| 121 |
int rv=ERROR; |
| 122 |
assert(lf != NULL); |
| 123 |
|
| 124 |
assert(! lf->isOpen); |
| 125 |
|
| 126 |
fprintf(stderr, "*** Opening %s\n", lf->filename); |
| 127 |
|
| 128 |
lf->input=gzopen(lf->filename, "r"); |
| 129 |
|
| 130 |
if(lf->input != NULL) { |
| 131 |
lf->isOpen=true; |
| 132 |
rv=OK; |
| 133 |
} |
| 134 |
|
| 135 |
/* Allocate the line buffer */ |
| 136 |
lf->line=(char*)calloc(1, LINE_BUFFER); |
| 137 |
assert(lf->line != NULL); |
| 138 |
lf->lineLength=0; |
| 139 |
|
| 140 |
/* Allocate the read buffer */ |
| 141 |
lf->gzBuf=(char*)calloc(1, GZBUFFER); |
| 142 |
assert(lf->gzBuf != NULL); |
| 143 |
|
| 144 |
lf->gzBufCur=NULL; |
| 145 |
lf->gzBufEnd=NULL; |
| 146 |
|
| 147 |
return(rv); |
| 148 |
} |
| 149 |
|
| 150 |
/*
 * A date string paired with an integer value.
 * NOTE(review): not referenced in the visible part of this file.
 */
struct date_str {
    char *datestr;  /* the textual form */
    int val;        /* the number that goes with it */
};
| 155 |
|
| 156 |
/*
 * Month keys: the three letters of an English month abbreviation followed
 * by '/', packed most-significant-byte first into one integer
 * (e.g. "Jan/" becomes 0x4a616e2f).
 */
#define MONTH_KEY(a, b, c) (((((((a)<<8)|(b))<<8)|(c))<<8)|'/')

#define MONTH_JAN MONTH_KEY('J', 'a', 'n')
#define MONTH_FEB MONTH_KEY('F', 'e', 'b')
#define MONTH_MAR MONTH_KEY('M', 'a', 'r')
#define MONTH_APR MONTH_KEY('A', 'p', 'r')
#define MONTH_MAY MONTH_KEY('M', 'a', 'y')
#define MONTH_JUN MONTH_KEY('J', 'u', 'n')
#define MONTH_JUL MONTH_KEY('J', 'u', 'l')
#define MONTH_AUG MONTH_KEY('A', 'u', 'g')
#define MONTH_SEP MONTH_KEY('S', 'e', 'p')
#define MONTH_OCT MONTH_KEY('O', 'c', 't')
#define MONTH_NOV MONTH_KEY('N', 'o', 'v')
#define MONTH_DEC MONTH_KEY('D', 'e', 'c')

/*
 * Convert a three character month to the numeric value.
 *
 * input points at the abbreviation as it appears inside a timestamp, i.e.
 * immediately followed by '/' ("Jan/2007...").  At most the first four
 * bytes are examined, and reading stops early at a NUL.
 *
 * Returns 0 (Jan) through 11 (Dec), or -1 when those bytes are not one of
 * the month keys above.  Matching is case sensitive.
 */
TESTED_STATIC int parseMonth(const char *input) {
    unsigned int key=0;

    /* Go through unsigned char and an unsigned accumulator: with a signed
     * char a byte >= 0x80 would sign-extend over the bytes already packed
     * and the next shift would act on a negative value. */
    for(int i=0; i<4 && input[i] != '\0'; i++) {
        key = (key << 8) | (unsigned char)input[i];
    }

    switch(key) {
        case MONTH_JAN: return 0;
        case MONTH_FEB: return 1;
        case MONTH_MAR: return 2;
        case MONTH_APR: return 3;
        case MONTH_MAY: return 4;
        case MONTH_JUN: return 5;
        case MONTH_JUL: return 6;
        case MONTH_AUG: return 7;
        case MONTH_SEP: return 8;
        case MONTH_OCT: return 9;
        case MONTH_NOV: return 10;
        case MONTH_DEC: return 11;
        default: break;
    }

    return -1;
}
| 195 |
|
| 196 |
/* Thrown from inside parseTimestamp() to abandon a line whose timestamp
 * cannot be parsed. */
class BadTimestamp : public std::exception {
    /* Fixed description of the failure.  Declared in the private section
     * here, so it is reached through the std::exception base. */
    virtual const char* what() const throw() {
        static const char description[]="Timestamp parse error";
        return description;
    }
};
| 201 |
|
| 202 |
static time_t parseTimestamp(struct logfile *lf) |
| 203 |
{ |
| 204 |
char *p; |
| 205 |
|
| 206 |
assert(lf != NULL); |
| 207 |
assert(lf->line != NULL); |
| 208 |
|
| 209 |
lf->timestamp=-1; |
| 210 |
|
| 211 |
p=lf->line; |
| 212 |
|
| 213 |
try { |
| 214 |
|
| 215 |
/* The shortest line I can parse is about 32 characters. */ |
| 216 |
if(lf->lineLength < 32) { |
| 217 |
/* This is a broken entry */ |
| 218 |
fprintf(stderr, "Broken log entry (too short): %s\n", p); |
| 219 |
} else if(index(p, '[') != NULL) { |
| 220 |
struct tm tm; |
| 221 |
memset(&tm, 0x00, sizeof(tm)); |
| 222 |
|
| 223 |
p=index(p, '['); |
| 224 |
/* Input validation */ |
| 225 |
if(p == NULL || lf->lineLength < 32) { |
| 226 |
fprintf(stderr, "invalid log line: %s\n", lf->line); |
| 227 |
throw BadTimestamp(); |
| 228 |
} |
| 229 |
|
| 230 |
/* fprintf(stderr, "**** Parsing %s\n", p); */ |
| 231 |
p++; |
| 232 |
tm.tm_mday=atoi(p); |
| 233 |
p+=3; |
| 234 |
tm.tm_mon=parseMonth(p); |
| 235 |
p+=4; |
| 236 |
tm.tm_year=atoi(p); |
| 237 |
p+=5; |
| 238 |
tm.tm_hour=atoi(p); |
| 239 |
p+=3; |
| 240 |
tm.tm_min=atoi(p); |
| 241 |
p+=3; |
| 242 |
tm.tm_sec=atoi(p); |
| 243 |
|
| 244 |
/* Make sure it still looks like CLF */ |
| 245 |
if(p[2] != ' ') { |
| 246 |
fprintf(stderr, |
| 247 |
"log line is starting to not look like CLF: %s\n", |
| 248 |
lf->line); |
| 249 |
throw BadTimestamp(); |
| 250 |
} |
| 251 |
|
| 252 |
tm.tm_year-=1900; |
| 253 |
|
| 254 |
/* Let mktime guess the timezone */ |
| 255 |
tm.tm_isdst=-1; |
| 256 |
|
| 257 |
lf->timestamp=mktime(&tm); |
| 258 |
|
| 259 |
} else { |
| 260 |
fprintf(stderr, "Unknown log format: %s\n", p); |
| 261 |
} |
| 262 |
|
| 263 |
} catch(BadTimestamp e) { |
| 264 |
// Damn. |
| 265 |
} |
| 266 |
|
| 267 |
if(lf->timestamp < 0) { |
| 268 |
fprintf(stderr, "* Error parsing timestamp from %s", lf->line); |
| 269 |
} |
| 270 |
|
| 271 |
return(lf->timestamp); |
| 272 |
} |
| 273 |
|
| 274 |
/** |
| 275 |
* Get the next line from a log file. |
| 276 |
* Return whether the seek actually occurred. |
| 277 |
*/ |
| 278 |
static bool nextLine(struct logfile *lf) |
| 279 |
{ |
| 280 |
bool rv=false; |
| 281 |
|
| 282 |
assert(lf != NULL); |
| 283 |
|
| 284 |
if(!lf->isOpen) { |
| 285 |
int logfileOpened=openLogfile(lf); |
| 286 |
/* This looks a little awkward, but it's the only way I can both |
| 287 |
* avoid the side effect of having assert perform the task and |
| 288 |
* not leave the variable unreferenced when assertions are off. |
| 289 |
*/ |
| 290 |
if(logfileOpened != OK) { |
| 291 |
assert(logfileOpened == OK); |
| 292 |
} |
| 293 |
/* Recurse to skip a line */ |
| 294 |
rv=nextLine(lf); |
| 295 |
assert(rv); |
| 296 |
} |
| 297 |
|
| 298 |
if(myGzgets(lf)) { |
| 299 |
rv=true; |
| 300 |
char *p=lf->line; |
| 301 |
/* Make sure the line is short enough */ |
| 302 |
assert(lf->lineLength < LINE_BUFFER); |
| 303 |
/* Make sure we read a line */ |
| 304 |
if(p[lf->lineLength-1] != '\n') { |
| 305 |
fprintf(stderr, "*** BROKEN LOG ENTRY IN %s (no newline)\n", |
| 306 |
lf->filename); |
| 307 |
rv=false; |
| 308 |
} else if(parseTimestamp(lf) == -1) { |
| 309 |
/* If we can't parse the timestamp, give up */ |
| 310 |
rv=false; |
| 311 |
} |
| 312 |
} |
| 313 |
|
| 314 |
return rv; |
| 315 |
} |
| 316 |
|
| 317 |
static void closeLogfile(struct logfile *lf) |
| 318 |
{ |
| 319 |
int gzerrno=0; |
| 320 |
|
| 321 |
assert(lf != NULL); |
| 322 |
assert(lf->input != NULL); |
| 323 |
assert(lf->filename != NULL); |
| 324 |
|
| 325 |
fprintf(stderr, "*** Closing %s\n", lf->filename); |
| 326 |
|
| 327 |
/* Free the line buffer */ |
| 328 |
if(lf->line != NULL) { |
| 329 |
free(lf->line); |
| 330 |
lf->line=NULL; |
| 331 |
} |
| 332 |
|
| 333 |
gzerrno=gzclose(lf->input); |
| 334 |
if(gzerrno!=0) { |
| 335 |
gzerror(lf->input, &gzerrno); |
| 336 |
} |
| 337 |
lf->isOpen=false; |
| 338 |
|
| 339 |
if(lf->gzBuf != NULL) { |
| 340 |
free(lf->gzBuf); |
| 341 |
lf->gzBuf = NULL; |
| 342 |
} |
| 343 |
|
| 344 |
lf->gzBufCur=NULL; |
| 345 |
lf->gzBufEnd=NULL; |
| 346 |
} |
| 347 |
|
| 348 |
/** |
| 349 |
* Get rid of a logfile that's no longer needed. |
| 350 |
*/ |
| 351 |
static void destroyLogfile(struct logfile *lf) |
| 352 |
{ |
| 353 |
assert(lf != NULL); |
| 354 |
|
| 355 |
fprintf(stderr, "** Destroying %s\n", lf->filename); |
| 356 |
|
| 357 |
if(lf->isOpen) { |
| 358 |
closeLogfile(lf); |
| 359 |
} |
| 360 |
|
| 361 |
/* Free the parts */ |
| 362 |
if(lf->filename!=NULL) { |
| 363 |
free(lf->filename); |
| 364 |
} |
| 365 |
if(lf->line != NULL) { |
| 366 |
free(lf->line); |
| 367 |
} |
| 368 |
if(lf->gzBuf != NULL) { |
| 369 |
free(lf->gzBuf); |
| 370 |
} |
| 371 |
|
| 372 |
/* Lastly, free the container itself. */ |
| 373 |
free(lf); |
| 374 |
} |
| 375 |
|
| 376 |
/** |
| 377 |
* Create a new logfile. |
| 378 |
*/ |
| 379 |
struct logfile *createLogfile(const char *filename) |
| 380 |
{ |
| 381 |
struct logfile *rv=NULL; |
| 382 |
|
| 383 |
rv=(struct logfile *)calloc(1, sizeof(struct logfile)); |
| 384 |
assert(rv != NULL); |
| 385 |
|
| 386 |
rv->filename=(char *)strdup(filename); |
| 387 |
assert(rv->filename != NULL); |
| 388 |
|
| 389 |
/* Try to open the logfile */ |
| 390 |
if(openLogfile(rv) != OK) { |
| 391 |
destroyLogfile(rv); |
| 392 |
rv=NULL; |
| 393 |
} else { |
| 394 |
/* If it's opened succesfully, read the next (first) line */ |
| 395 |
if(!nextLine(rv)) { |
| 396 |
/* If nextLine didn't return a record, this entry is invalid. */ |
| 397 |
destroyLogfile(rv); |
| 398 |
rv=NULL; |
| 399 |
} else { |
| 400 |
/* Otherwise, it's valid and we'll proceed, but close it. */ |
| 401 |
switch(identifyLog(rv->line)) { |
| 402 |
case COMMON: |
| 403 |
fprintf(stderr, "**** %s is a common log file\n", filename); |
| 404 |
rv->outputLine=outputLineDirect; |
| 405 |
break; |
| 406 |
case AMAZON_S3: |
| 407 |
fprintf(stderr, "**** %s is an s3 log file\n", filename); |
| 408 |
rv->outputLine=outputLineS3; |
| 409 |
break; |
| 410 |
case UNKNOWN: |
| 411 |
fprintf(stderr, "! Can't identify type of %s\n", filename); |
| 412 |
break; |
| 413 |
default: |
| 414 |
assert(false); |
| 415 |
} |
| 416 |
|
| 417 |
if(rv->outputLine == NULL) { |
| 418 |
destroyLogfile(rv); |
| 419 |
rv=NULL; |
| 420 |
} else { |
| 421 |
closeLogfile(rv); |
| 422 |
} |
| 423 |
} |
| 424 |
} |
| 425 |
|
| 426 |
return(rv); |
| 427 |
} |
| 428 |
|
| 429 |
/** |
| 430 |
* Get rid of the first entry in the log list, and reinsert it somewhere |
| 431 |
* that makes sense, or throw it away if it's no longer necessary. |
| 432 |
*/ |
| 433 |
void skipRecord(log_queue& queue) |
| 434 |
{ |
| 435 |
struct logfile *oldEntry=NULL; |
| 436 |
assert(!queue.empty()); |
| 437 |
|
| 438 |
oldEntry=queue.top(); |
| 439 |
queue.pop(); |
| 440 |
|
| 441 |
/* If stuff comes back, reinsert the old entry */ |
| 442 |
if(nextLine(oldEntry)) { |
| 443 |
queue.push(oldEntry); |
| 444 |
} else { |
| 445 |
destroyLogfile(oldEntry); |
| 446 |
} |
| 447 |
} |