| 1 |
#!/usr/bin/python |
| 2 |
#-*- coding:utf-8 -*- |
| 3 |
|
| 4 |
# |
| 5 |
# This code is part of the LWN git data miner. |
| 6 |
# |
| 7 |
# Copyright 2007-9 LWN.net |
| 8 |
# Copyright 2007-9 Jonathan Corbet <corbet@lwn.net> |
| 9 |
# Copyright 2009 Germán Póo-Caamaño <gpoo@gnome.org> |
| 10 |
# |
| 11 |
# This file may be distributed under the terms of the GNU General |
| 12 |
# Public License, version 2. |
| 13 |
|
| 14 |
|
| 15 |
import database, csvdump, ConfigFile, reports |
| 16 |
import getopt, datetime |
| 17 |
import os, re, sys, rfc822, string |
| 18 |
import file_types |
| 19 |
import logparser |
| 20 |
from patterns import patterns |
| 21 |
|
| 22 |
# Date captured once at start-up; grabpatch() clamps commit dates that
# lie after it back to this value.
Today = datetime.date.today()
#
# Control options.  These are the defaults; ParseOpts() overrides them
# from the command line.
#
MapUnknown = 0          # -u sets to 1; passed to database.LookupEmployer()
DevReports = 1          # -d sets to 0; gates the reports.DevReports() call
DateStats = 0           # -D sets to 1; write date statistics, then exit
AuthorSOBs = 1          # -s sets to 0; ignore an author's own signoffs
FileFilter = None       # -r pattern; compiled regex restricting file names
CSVFile = None          # -x file; open file object for the raw CSV export
CSVPrefix = None        # -p prefix; prefix handed to csvdump.save_csv()
AkpmOverLt = 0          # -a sets to 1; Andrew Morton's signoffs shadow Linus's
DumpDB = 0              # -z sets to 1; dump the hacker database at completion
CFName = 'gitdm.config' # -c cfile; configuration file to read
| 36 |
# |
| 37 |
# Options: |
| 38 |
# |
| 39 |
# -a Andrew Morton's signoffs shadow Linus's |
| 40 |
# -c cfile Specify a configuration file |
| 41 |
#	-d		Suppress the individual developer reports
| 42 |
# -D Output date statistics |
| 43 |
# -h hfile HTML output to hfile |
| 44 |
# -l count Maximum length for output lists |
| 45 |
# -o file File for text output |
| 46 |
# -p prefix Prefix for CSV output |
| 47 |
# -r pattern Restrict to files matching pattern |
| 48 |
# -s Ignore author SOB lines |
| 49 |
# -u Map unknown employers to '(Unknown)' |
| 50 |
# -x file.csv Export raw statistics as CSV |
| 51 |
# -z Dump out the hacker database at completion |
| 52 |
|
| 53 |
def ParseOpts ():
    """
    Parse sys.argv[1:] and update the module-level control options.

    Flags that only toggle or store a value rebind the matching global;
    -h, -l and -o are forwarded straight to the reports module, and -x
    opens the CSV export file for writing.  Positional arguments left
    over after option parsing are ignored.  getopt.GetoptError is not
    caught here and propagates to the caller.
    """
    global MapUnknown, DevReports
    global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
    global CFName, CSVFile, CSVPrefix

    parsed, _leftover = getopt.getopt(sys.argv[1:], 'adc:Dh:l:o:p:r:sux:z')
    for flag, value in parsed:
        if flag == '-a':
            AkpmOverLt = 1
        elif flag == '-c':
            CFName = value
        elif flag == '-d':
            # Note: -d turns the developer reports *off*.
            DevReports = 0
        elif flag == '-D':
            DateStats = 1
        elif flag == '-h':
            reports.SetHTMLOutput(open(value, 'w'))
        elif flag == '-l':
            reports.SetMaxList(int(value))
        elif flag == '-o':
            reports.SetOutput(open(value, 'w'))
        elif flag == '-p':
            CSVPrefix = value
        elif flag == '-r':
            print ('Filter on "%s"' % (value))
            FileFilter = re.compile(value)
        elif flag == '-s':
            AuthorSOBs = 0
        elif flag == '-u':
            MapUnknown = 1
        elif flag == '-x':
            CSVFile = open(value, 'w')
            print ("open output file " + value + "\n")
        elif flag == '-z':
            DumpDB = 1
| 88 |
|
| 89 |
|
| 90 |
|
| 91 |
def LookupStoreHacker (name, email):
    """
    Return the database entry for a (name, email) pair, creating or
    extending one as needed.

    The email is first passed through database.RemapEmail().  If that
    address is already known, its hacker is returned.  Otherwise the
    employer list for the address is looked up (honouring MapUnknown)
    and either attached to an existing hacker of the same name or used
    to store a brand-new hacker.
    """
    email = database.RemapEmail(email)
    known = database.LookupEmail(email)
    if known:                           # address already on file
        return known

    employers = database.LookupEmployer(email, MapUnknown)
    namesake = database.LookupName(name)
    if not namesake:                    # never seen this person before
        return database.StoreHacker(name, employers, email)

    namesake.addemail(email, employers) # known person, new address
    return namesake
| 102 |
|
| 103 |
# |
| 104 |
# Date tracking. |
| 105 |
# |
| 106 |
|
| 107 |
# Maps a date to the number of changed lines accumulated for that date.
DateMap = {}

def AddDateLines(date, lines):
    """
    Add `lines` to the running count kept for `date` in DateMap.

    Patches reporting more than 1000000 lines are announced on stdout
    and left out of the map.
    """
    if lines > 1000000:
        print ('Skip big patch (%d)' % lines)
        return
    DateMap[date] = DateMap.get(date, 0) + lines
| 117 |
|
| 118 |
def PrintDateStats():
    """
    Write the per-date line counts from DateMap to the file 'datelc'.

    One line is written per date, in ascending date order, holding the
    date as YYYY/MM/DD, the lines changed on that date and the running
    total so far.  The file is created in the current working directory
    and overwritten if it already exists.  The keys of DateMap must
    offer .year, .month and .day attributes.
    """
    total = 0
    # The context manager guarantees the file is flushed and closed even
    # if formatting a row raises; the handle used to be left open.
    with open('datelc', 'w') as datef:
        for date in sorted(DateMap):
            total += DateMap[date]
            datef.write('%d/%02d/%02d %6d %7d\n' % (date.year, date.month,
                                                    date.day, DateMap[date],
                                                    total))
| 127 |
|
| 128 |
|
| 129 |
# |
| 130 |
# Let's slowly try to move some smarts into this class. |
| 131 |
# |
| 132 |
class patch:
    """
    Everything collected about one changeset.

    A new instance starts out credited to a placeholder "Unknown hacker"
    with zero added/removed lines and empty tag lists; the parser fills
    in the real values as it reads the log entry.  Note that no `date`
    attribute is created here - grabpatch() assigns it when it meets the
    date line.
    """
    # Indices into the [added, removed] pairs stored in self.filetypes.
    ADDED = 0
    REMOVED = 1

    def __init__(self, commit):
        self.commit = commit
        self.merge = 0
        self.added = 0
        self.removed = 0
        self.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net')
        self.email = 'unknown@hacker.net'
        self.sobs = []          # (email, hacker) pairs
        self.reviews = []
        self.testers = []
        self.reports = []
        self.filetypes = {}     # file type -> [added, removed]

    def addreviewer(self, reviewer):
        """Record `reviewer` as having reviewed this patch."""
        self.reviews.append(reviewer)

    def addtester(self, tester):
        """Record `tester` as having tested this patch."""
        self.testers.append(tester)

    def addreporter(self, reporter):
        """Record `reporter` as having reported the problem fixed here."""
        self.reports.append(reporter)

    def addfiletype(self, filetype, added, removed):
        """Accumulate added/removed line counts under `filetype`."""
        counts = self.filetypes.get(filetype)
        if counts is None:
            self.filetypes[filetype] = [added, removed]
            return
        counts[self.ADDED] += added
        counts[self.REMOVED] += removed
| 161 |
|
| 162 |
def parse_numstat(line, file_filter):
    """
    Try to read `line` as a numstat line.

    Returns a (filename, filetype, added, removed) tuple on success.  A
    tuple of four Nones is returned when the line does not match the
    'numstat' pattern, or when `file_filter` is set and its search()
    does not find a match in the file name.
    """
    no_result = (None, None, None, None)

    stat = patterns['numstat'].match(line)
    if not stat:
        return no_result

    filename = stat.group(3)
    # A file filter, when given, restricts which files are counted.
    if file_filter and not file_filter.search(filename):
        return no_result

    try:
        added = int(stat.group(1))
        removed = int(stat.group(2))
    except ValueError:
        # A binary file (image, etc.) is marked with '-'
        added = removed = 0

    # NOTE(review): groups 1, 3 and 4 of the 'rename' pattern are
    # presumably the pieces of the post-rename path - confirm against
    # the patterns module.
    renamed = patterns['rename'].match(filename)
    if renamed:
        filename = '%s%s%s' % (renamed.group(1), renamed.group(3),
                               renamed.group(4))

    filetype = file_types.guess_file_type(os.path.basename(filename))
    return filename, filetype, added, removed
| 189 |
|
| 190 |
# |
| 191 |
# The core hack for grabbing the information about a changeset. |
| 192 |
# |
| 193 |
def _tagged_hacker(m):
    """Return the hacker for a matched tag line (group 1 name, group 2 email)."""
    email = database.RemapEmail(m.group(2))
    return LookupStoreHacker(m.group(1), email)

#
# The core hack for grabbing the information about a changeset.
#
def grabpatch(logpatch):
    """
    Build a patch object from the lines of one log entry.

    `logpatch` is a sequence of lines whose first element must match the
    'commit' pattern; if it does not, None is returned.  Each remaining
    line is tested, in order, against the author, signed-off-by,
    reviewed-by, tested-by, reported-by, reported-and-tested-by, merge
    and date patterns, and finally handed to parse_numstat().

    For a non-merge patch that changed lines (or when no file filter is
    active) this also updates the module totals, the per-date line
    counts and the employer changeset/signoff counters.

    Returns the populated patch.
    """
    global TotalAdded, TotalRemoved, TotalChanged

    m = patterns['commit'].match(logpatch[0])
    if not m:
        return None

    p = patch(m.group(1))

    for Line in logpatch[1:]:
        #
        # Maybe it's an author line?
        #
        m = patterns['author'].match(Line)
        if m:
            p.email = database.RemapEmail(m.group(2))
            p.author = LookupStoreHacker(m.group(1), p.email)
            continue
        #
        # Could be a signed-off-by:
        #
        m = patterns['signed-off-by'].search(Line)
        if m:
            email = database.RemapEmail(m.group(2))
            sobber = LookupStoreHacker(m.group(1), email)
            if sobber != p.author or AuthorSOBs:
                # Reuse the hacker just looked up rather than querying
                # the database a second time for the same person.
                p.sobs.append((email, sobber))
            continue
        #
        # Various other tags of interest.  The credit calls receive the
        # patch instance itself, not the patch class.
        #
        # Reviewed-by:
        m = patterns['reviewed-by'].search(Line)
        if m:
            p.addreviewer(_tagged_hacker(m))
            continue
        # Tested-by:
        m = patterns['tested-by'].search(Line)
        if m:
            p.addtester(_tagged_hacker(m))
            p.author.testcredit(p)
            continue
        # Reported-by:
        m = patterns['reported-by'].search(Line)
        if m:
            p.addreporter(_tagged_hacker(m))
            p.author.reportcredit(p)
            continue
        # Reported-and-tested-by:
        m = patterns['reported-and-tested-by'].search(Line)
        if m:
            h = _tagged_hacker(m)
            p.addreporter(h)
            p.addtester(h)
            p.author.reportcredit(p)
            p.author.testcredit(p)
            continue
        #
        # If this one is a merge, make note of the fact.
        #
        m = patterns['merge'].match(Line)
        if m:
            p.merge = 1
            continue
        #
        # See if it's the date.
        #
        m = patterns['date'].match(Line)
        if m:
            dt = rfc822.parsedate(m.group(2))
            if dt is None:
                # parsedate() signals an unparseable date with None;
                # fall back to today instead of dying on dt[0].
                sys.stderr.write('Unparseable date: %s\n' % m.group(2))
                p.date = Today
            else:
                p.date = datetime.date(dt[0], dt[1], dt[2])
                if p.date > Today:
                    sys.stderr.write('Funky date: %s\n' % p.date)
                    p.date = Today
            continue

        # Get the statistics (lines added/removed) using numstat
        # and without requiring a diff (--numstat instead of -p)
        (filename, filetype, added, removed) = parse_numstat(Line, FileFilter)
        if filename:
            p.added += added
            p.removed += removed
            p.addfiletype(filetype, added, removed)

    if '@' in p.author.name:
        print ('%s is an author name, probably not what you want' % p.author.name)

    #
    # Record some global information - but only if this patch had
    # stuff which wasn't ignored.  This work should be done
    # elsewhere,
    #
    if ((p.added + p.removed) > 0 or not FileFilter) and not p.merge:
        changed = max(p.added, p.removed)
        TotalAdded += p.added
        TotalRemoved += p.removed
        TotalChanged += changed
        AddDateLines(p.date, changed)
        empl = p.author.emailemployer(p.email, p.date)
        empl.AddCSet(p)
        if AkpmOverLt:
            TrimLTSOBs(p)
        for sobemail, sobber in p.sobs:
            empl = sobber.emailemployer(sobemail, p.date)
            empl.AddSOB()
    return p
| 302 |
|
| 303 |
# |
| 304 |
# If this patch is signed off by both Andrew Morton and Linus Torvalds, |
| 305 |
# remove the (redundant) Linus signoff. |
| 306 |
# |
| 307 |
def TrimLTSOBs (p):
    """
    Remove the Linus entry from p.sobs when the Akpm entry is also
    present; otherwise leave the signoff list untouched.
    """
    signoffs = p.sobs
    if Linus not in signoffs or Akpm not in signoffs:
        return
    signoffs.remove(Linus)
| 310 |
|
| 311 |
|
| 312 |
# |
| 313 |
# Here starts the real program. |
| 314 |
# |
| 315 |
# Command-line options first: they select the configuration file.
ParseOpts()

#
# Read the config files.
#
ConfigFile.ConfigFile(CFName)

#
# Pre-seed the database with the two hackers TrimLTSOBs() needs to
# recognise.  Each is kept as an (email, hacker) pair, the same shape
# as the entries in a patch's sobs list.
#
Linus = ('torvalds@linux-foundation.org',
         LookupStoreHacker('Linus Torvalds', 'torvalds@linux-foundation.org'))
Akpm = ('akpm@linux-foundation.org',
        LookupStoreHacker('Andrew Morton', 'akpm@linux-foundation.org'))

# Running totals, updated by grabpatch().
TotalAdded = 0
TotalRemoved = 0
TotalChanged = 0
| 332 |
|
| 333 |
# |
| 334 |
# Snarf changesets. |
| 335 |
# |
| 336 |
# Progress goes to stderr; the trailing '\r' keeps it on a single line.
sys.stderr.write('Grabbing changesets...\r')

patches = logparser.LogPatchSplitter(sys.stdin)
printcount = CSCount = 0

for logpatch in patches:
    # Refresh the progress counter every 50 entries.
    if printcount % 50 == 0:
        sys.stderr.write('Grabbing changesets...%d\r' % printcount)
    printcount += 1

    p = grabpatch(logpatch)
    if not p:
        # An entry that does not parse ends the run.
        break
    # With a file filter active, drop patches that touched nothing of
    # interest.
    if FileFilter and not (p.added or p.removed):
        continue
    if not p.merge:
        # Only non-merge changesets earn credit and are counted.
        p.author.addpatch(p)
        for sobemail, sob in p.sobs:
            sob.addsob(p)
        for hacker in p.reviews:
            hacker.addreview(p)
        for hacker in p.testers:
            hacker.addtested(p)
        for hacker in p.reports:
            hacker.addreport(p)
        CSCount += 1
    csvdump.AccumulatePatch(p)
    csvdump.store_patch(p)
sys.stderr.write('Grabbing changesets...done ' + '\n')
| 367 |
|
| 368 |
if DumpDB:
    database.DumpDB()

#
# Say something
#
hlist = database.AllHackers()
elist = database.AllEmployers()
# Developers with at least one patch, employers with a non-zero count.
ndev = len([h for h in hlist if len(h.patches) > 0])
nempl = len([e for e in elist if e.count > 0])

reports.Write('Processed %d csets from %d developers\n' % (CSCount, ndev))
reports.Write('%d employers found\n' % (nempl))
reports.Write('A total of %d lines added, %d removed (delta %d)\n' %
              (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))

# HACK to avoid div by zero
TotalChanged = TotalChanged or 1

# Date statistics are a terminal mode: nothing below runs after them.
if DateStats:
    PrintDateStats()
    sys.exit(0)

if CSVPrefix:
    csvdump.save_csv(CSVPrefix)

if CSVFile:
    csvdump.OutputCSV(CSVFile)
    CSVFile.close()

if DevReports:
    reports.DevReports(hlist, TotalChanged, CSCount, TotalRemoved)
reports.EmplReports(elist, TotalChanged, CSCount)