#!/usr/bin/python
#-*- coding:utf-8 -*-

#
# This code is part of the LWN git data miner.
#
# Copyright 2007-9 LWN.net
# Copyright 2007-9 Jonathan Corbet <corbet@lwn.net>
# Copyright 2009 Germán Póo-Caamaño <gpoo@gnome.org>
#
# This file may be distributed under the terms of the GNU General
# Public License, version 2.
13
14
15
import database, csvdump, ConfigFile, reports
16
import getopt, datetime
17
import os, re, sys, rfc822, string
18
import file_types
19
import logparser
20
from patterns import patterns
21
22
# Reference date: commit dates later than this are clamped to it.
Today = datetime.date.today()
#
# Control options.  These are the defaults; ParseOpts() overrides them
# from the command line.
#
MapUnknown = 0          # -u: passed to database.LookupEmployer
DevReports = 1          # -d clears this: skip the developer reports
DateStats = 0           # -D: write date statistics and exit
AuthorSOBs = 1          # -s clears this: skip the author's own signoffs
FileFilter = None       # -r: compiled regex restricting the files counted
CSVFile = None          # -x: open file object for the raw CSV export
CSVPrefix = None        # -p: prefix handed to csvdump.save_csv
AkpmOverLt = 0          # -a: drop Linus's signoff when Andrew Morton signed too
DumpDB = 0              # -z: dump the hacker database at completion
CFName = 'gitdm.config' # -c: configuration file name
36
#
# Options:
#
# -a		Andrew Morton's signoffs shadow Linus's
# -c cfile	Specify a configuration file
# -d		Omit individual developer stats
# -D		Write date statistics to 'datelc' and exit
# -h hfile	HTML output to hfile
# -l count	Maximum length for output lists
# -o file	File for text output
# -p prefix	Prefix for CSV output
# -r pattern	Restrict to files matching pattern
# -s		Ignore author SOB lines
# -u		Map unknown employers to '(Unknown)'
# -x file.csv	Export raw statistics as CSV
# -z		Dump out the hacker database at completion
52
53
def ParseOpts ():
    """
        Parse the command line (sys.argv[1:]) and store the result in
        the module-level control options.  The output-related switches
        (-h, -l, -o) are handed straight to the reports module.
        Non-option arguments are ignored.
    """
    global MapUnknown, DevReports
    global DateStats, AuthorSOBs, FileFilter, AkpmOverLt, DumpDB
    global CFName, CSVFile, CSVPrefix

    options, leftover = getopt.getopt (sys.argv[1:], 'adc:Dh:l:o:p:r:sux:z')
    for flag, value in options:
        if flag == '-a':
            AkpmOverLt = 1
        elif flag == '-c':
            CFName = value
        elif flag == '-d':
            # -d turns the developer reports off.
            DevReports = 0
        elif flag == '-D':
            DateStats = 1
        elif flag == '-h':
            reports.SetHTMLOutput (open (value, 'w'))
        elif flag == '-l':
            reports.SetMaxList (int (value))
        elif flag == '-o':
            reports.SetOutput (open (value, 'w'))
        elif flag == '-p':
            CSVPrefix = value
        elif flag == '-r':
            print ('Filter on "%s"' % (value))
            FileFilter = re.compile (value)
        elif flag == '-s':
            AuthorSOBs = 0
        elif flag == '-u':
            MapUnknown = 1
        elif flag == '-x':
            CSVFile = open (value, 'w')
            print ("open output file " + value + "\n")
        elif flag == '-z':
            DumpDB = 1
88
        
89
90
91
def LookupStoreHacker (name, email):
    """
        Return the database entry for the hacker with this name/email,
        creating it when needed.

        The email is remapped first.  A hacker already known under that
        email is returned as is; one known only by name gets the email
        (and its employer list) added; otherwise a new hacker is stored.
    """
    email = database.RemapEmail (email)
    hacker = database.LookupEmail (email)
    if not hacker:
        employers = database.LookupEmployer (email, MapUnknown)
        hacker = database.LookupName (name)
        if hacker:
            # Known name, new email.
            hacker.addemail (email, employers)
        else:
            hacker = database.StoreHacker(name, employers, email)
    return hacker
102
103
#
104
# Date tracking.
105
#
106
107
# Maps a date to the number of changed lines recorded for it.
DateMap = { }

def AddDateLines(date, lines):
    """
        Add `lines` to the running count kept for `date` in DateMap.
        Patches of more than 1000000 lines are reported and skipped.
    """
    if lines > 1000000:
        print ('Skip big patch (%d)' % lines)
        return
    DateMap[date] = DateMap.get (date, 0) + lines
117
118
def PrintDateStats():
    """
        Write the per-date statistics to the file 'datelc' in the
        current directory, one line per date in ascending order:
        the date, the lines recorded for that date, and the running
        total up to and including it.
    """
    total = 0
    # The with block guarantees the file is flushed and closed even if
    # a write fails.
    with open ('datelc', 'w') as datef:
        for date in sorted (DateMap):
            total += DateMap[date]
            datef.write ('%d/%02d/%02d %6d %7d\n' % (date.year, date.month,
                                                     date.day, DateMap[date],
                                                     total))
127
128
129
#
130
# Let's slowly try to move some smarts into this class.
131
#
132
class patch:
    """
        What has been collected about one changeset: its commit id,
        author, line counts, the people named in its tags and the
        per-file-type line counts.
    """
    # Indexes into the [added, removed] lists stored in self.filetypes.
    (ADDED, REMOVED) = range (2)

    def __init__ (self, commit):
        """Start an empty record for `commit` with a placeholder author."""
        self.commit = commit
        self.merge = self.added = self.removed = 0
        # Placeholder author until one is assigned from the log.
        self.author = LookupStoreHacker('Unknown hacker', 'unknown@hacker.net')
        self.email = 'unknown@hacker.net'
        self.sobs = [ ]
        self.reviews = [ ]
        self.testers = [ ]
        self.reports = [ ]
        self.filetypes = {}

    def addreviewer (self, reviewer):
        """Record `reviewer` as having reviewed this patch."""
        self.reviews.append (reviewer)

    def addtester (self, tester):
        """Record `tester` as having tested this patch."""
        self.testers.append (tester)

    def addreporter (self, reporter):
        """Record `reporter` as having reported the problem fixed here."""
        self.reports.append (reporter)

    def addfiletype (self, filetype, added, removed):
        """Accumulate added/removed line counts for `filetype`."""
        if filetype in self.filetypes:
            counts = self.filetypes[filetype]
            counts[self.ADDED] += added
            counts[self.REMOVED] += removed
        else:
            self.filetypes[filetype] = [added, removed]
161
162
def parse_numstat(line, file_filter):
    """
        Try to read `line` as a numstat line.

        Returns (filename, filetype, added, removed).  All four are
        None when the line is not a numstat line or when `file_filter`
        is given and does not match the file name.
    """
    nothing = (None, None, None, None)

    numstat = patterns['numstat'].match (line)
    if not numstat:
        return nothing

    filename = numstat.group (3)
    # If we have a file filter, only matching files are counted.
    if file_filter and not file_filter.search (filename):
        return nothing

    try:
        added = int (numstat.group (1))
        removed = int (numstat.group (2))
    except ValueError:
        # A binary file (image, etc.) is marked with '-'
        added = removed = 0

    # A rename is collapsed into a single name built from groups 1, 3
    # and 4 of the 'rename' pattern.
    rename = patterns['rename'].match (filename)
    if rename:
        pieces = (rename.group (1), rename.group (3), rename.group (4))
        filename = '%s%s%s' % pieces

    filetype = file_types.guess_file_type (os.path.basename(filename))
    return filename, filetype, added, removed
189
190
#
191
# The core hack for grabbing the information about a changeset.
192
#
193
def grabpatch(logpatch):
    """
        Parse the log lines of one changeset into a patch object.

        The first entry of `logpatch` has to match the 'commit' pattern;
        None is returned when it does not.  Each following line is
        tried, in order, against the author, signed-off-by, reviewed-by,
        tested-by, reported-by, reported-and-tested-by, merge and date
        patterns, and finally as a numstat line.

        For a non-merge patch that changed lines (or when no FileFilter
        is set) the global totals, the per-date line counts and the
        employer records of the author and of the signers are updated
        as a side effect.
    """
    global TotalAdded, TotalRemoved, TotalChanged

    m = patterns['commit'].match (logpatch[0])
    if not m:
        return None

    p = patch(m.group (1))

    for Line in logpatch[1:]:
        #
        # Maybe it's an author line?
        #
        m = patterns['author'].match (Line)
        if m:
            p.email = database.RemapEmail (m.group (2))
            p.author = LookupStoreHacker(m.group (1), p.email)
            continue
        #
        # Could be a signed-off-by:
        #
        m = patterns['signed-off-by'].search (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            sobber = LookupStoreHacker(m.group (1), email)
            # The author's own signoff is only kept when AuthorSOBs is set.
            if sobber != p.author or AuthorSOBs:
                p.sobs.append ((email, LookupStoreHacker(m.group (1), m.group (2))))
            continue
        #
        # Various other tags of interest.
        #
        # Reviewed-by:
        m = patterns['reviewed-by'].search (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreviewer (LookupStoreHacker(m.group (1), email))
            continue
        # Tested-by:
        m = patterns['tested-by'].search (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addtester (LookupStoreHacker (m.group (1), email))
            # Credit goes with this patch instance, not the patch class.
            p.author.testcredit (p)
            continue
        # Reported-by:
        m = patterns['reported-by'].search (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            p.addreporter (LookupStoreHacker (m.group (1), email))
            p.author.reportcredit (p)
            continue
        # Reported-and-tested-by:
        m = patterns['reported-and-tested-by'].search (Line)
        if m:
            email = database.RemapEmail (m.group (2))
            h = LookupStoreHacker (m.group (1), email)
            p.addreporter (h)
            p.addtester (h)
            p.author.reportcredit (p)
            p.author.testcredit (p)
            continue
        #
        # If this one is a merge, make note of the fact.
        #
        m = patterns['merge'].match (Line)
        if m:
            p.merge = 1
            continue
        #
        # See if it's the date.
        #
        # NOTE(review): p.date is only assigned here; a changeset with
        # no date line has no p.date when the totals are recorded below.
        m = patterns['date'].match (Line)
        if m:
            dt = rfc822.parsedate(m.group (2))
            p.date = datetime.date (dt[0], dt[1], dt[2])
            if p.date > Today:
                # Dates in the future are clamped to today.
                sys.stderr.write ('Funky date: %s\n' % p.date)
                p.date = Today
            continue

        # Get the statistics (lines added/removes) using numstats
        # and without requiring a diff (--numstat instead -p)
        (filename, filetype, added, removed) = parse_numstat (Line, FileFilter)
        if filename:
            p.added += added
            p.removed += removed
            p.addfiletype (filetype, added, removed)

    if '@' in p.author.name:
        print ('%s is an author name, probably not what you want' % p.author.name)

    #
    # Record some global information - but only if this patch had
    # stuff which wasn't ignored.  This work should be done
    # elsewhere,
    #
    if ((p.added + p.removed) > 0 or not FileFilter) and not p.merge:
        TotalAdded += p.added
        TotalRemoved += p.removed
        TotalChanged += max (p.added, p.removed)
        AddDateLines (p.date, max (p.added, p.removed))
        empl = p.author.emailemployer (p.email, p.date)
        empl.AddCSet (p)
        if AkpmOverLt:
            TrimLTSOBs (p)
        for sobemail, sobber in p.sobs:
            empl = sobber.emailemployer (sobemail, p.date)
            empl.AddSOB()
    return p
302
303
#
304
# If this patch is signed off by both Andrew Morton and Linus Torvalds,
305
# remove the (redundant) Linus signoff.
306
#
307
def TrimLTSOBs (p):
    """
        Remove the Linus entry from p.sobs when both the Linus and the
        Akpm entries are present; otherwise leave the list untouched.
    """
    signoffs = p.sobs
    if Linus not in signoffs or Akpm not in signoffs:
        return
    signoffs.remove (Linus)
310
311
312
#
313
# Here starts the real program.
314
#
315
ParseOpts ()

#
# Read the config files.
#
ConfigFile.ConfigFile (CFName)

#
# Let's pre-seed the database with a couple of hackers
# we want to remember.  Each is kept as an (email, hacker) pair, the
# same shape as the entries of patch.sobs.
#
Linus = ('torvalds@linux-foundation.org',
         LookupStoreHacker ('Linus Torvalds', 'torvalds@linux-foundation.org'))
Akpm = ('akpm@linux-foundation.org',
        LookupStoreHacker ('Andrew Morton', 'akpm@linux-foundation.org'))

# Running totals, updated by grabpatch().
TotalChanged = 0
TotalAdded = 0
TotalRemoved = 0

#
# Snarf changesets.
#
print >> sys.stderr, 'Grabbing changesets...\r',

patches = logparser.LogPatchSplitter(sys.stdin)
printcount = 0
CSCount = 0

for logpatch in patches:
    # Refresh the progress line every 50 changesets.
    if not printcount % 50:
        print >> sys.stderr, 'Grabbing changesets...%d\r' % printcount,
    printcount += 1
    p = grabpatch(logpatch)
    if not p:
        # Not a changeset: stop reading.
        break
#    if p.added > 100000 or p.removed > 100000:
#        print 'Skipping massive add', p.commit
#        continue
    if FileFilter and p.added == 0 and p.removed == 0:
        # Nothing in this changeset touched the files of interest.
        continue
    if not p.merge:
        p.author.addpatch (p)
        for sobemail, sob in p.sobs:
            sob.addsob (p)
        for hacker in p.reviews:
            hacker.addreview (p)
        for hacker in p.testers:
            hacker.addtested (p)
        for hacker in p.reports:
            hacker.addreport (p)
        CSCount += 1
    csvdump.AccumulatePatch (p)
    csvdump.store_patch (p)
print >> sys.stderr, 'Grabbing changesets...done       '

if DumpDB:
    database.DumpDB ()
#
# Say something
#
hlist = database.AllHackers ()
elist = database.AllEmployers ()
# Only hackers with patches and employers with a non-zero count are counted.
ndev = sum (1 for h in hlist if len (h.patches) > 0)
nempl = sum (1 for e in elist if e.count > 0)
reports.Write ('Processed %d csets from %d developers\n' % (CSCount,
                                                            ndev))
reports.Write ('%d employers found\n' % (nempl))
reports.Write ('A total of %d lines added, %d removed (delta %d)\n' %
               (TotalAdded, TotalRemoved, TotalAdded - TotalRemoved))
TotalChanged = TotalChanged or 1 # HACK to avoid div by zero
if DateStats:
    # Date statistics replace the regular reports.
    PrintDateStats ()
    sys.exit(0)

if CSVPrefix:
    csvdump.save_csv (CSVPrefix)

if CSVFile:
    csvdump.OutputCSV (CSVFile)
    CSVFile.close ()

if DevReports:
    reports.DevReports (hlist, TotalChanged, CSCount, TotalRemoved)
reports.EmplReports (elist, TotalChanged, CSCount)