Commit 5360422ba56dd333dd3a3e1a8f7589867078536c

Added routines to store git log as CSV and DB

Adds routines that store the output of git log as CSV files and
then (using those CSV files) insert the data into a database
using dbdriver.
gitdb.py
(119 / 0)
  
1#!/usr/bin/env python
2#-*- coding:utf-8 -*-
3#
4# Copyright © 2009 Germán Póo-Caamaño <gpoo@gnome.org>
5#
6# This program is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation; either version 2 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14# GNU Library General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program; if not, write to the Free Software
18# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
19"""
20 Handle the git log output and dump the changesets into
21 a CSV file.
22"""
23
24import os
25import sys
26import subprocess
27from dataloader import load_modules_from_csv, cmp_version
28from settings import settings
29import csv
30import dbdriver
31
def gitlog2csv(src, dst_prefix, force=False):
    """Run gitdm over a saved git log and return what gitdm prints.

    ``src`` is the path of the log file that is fed to gitdm on its
    standard input; ``dst_prefix`` is handed to gitdm through its ``-p``
    option.  The run is skipped when ``<dst_prefix>-changesets.csv``
    already exists, unless ``force`` is true.

    Returns gitdm's standard output, or None when the run was skipped or
    ``src`` could not be opened for reading.
    """
    if not force and os.path.isfile('%s-changesets.csv' % dst_prefix):
        return None

    # Open the log ourselves instead of spawning `cat`: the extra child was
    # never waited on and its pipe was never closed in this process.
    try:
        log = open(src, 'rb')
    except IOError:
        # Missing, unreadable or non-file source: nothing to convert.
        return None

    with log:
        options = '-c %s -o /dev/null -p %s -s -u' % (settings['config'],
                                                     dst_prefix)
        command = '%s %s' % (settings['gitdm'], options)
        # NOTE(review): the paths are interpolated unquoted into a shell
        # command line, so values containing spaces or shell metacharacters
        # will not survive; confirm the settings never contain them.
        proc_dm = subprocess.Popen(command, shell=True, stdin=log,
                                   stdout=subprocess.PIPE)
        return proc_dm.communicate()[0]
49
def git2csv(releases, modules, input_path, ouput_path, force=False):
    """Convert the stored git logs of every module into gitdm CSV files.

    For each repository in ``modules`` and each key of its ``'tags'``
    mapping (sorted with ``cmp_version``, reversed), the log file
    ``<input_path>/<repo>/<repo>-<version>.log`` is passed to gitlog2csv
    with ``<ouput_path>/<repo>-<version>`` as destination prefix.
    ``ouput_path`` is created when it does not exist yet.  ``force`` is
    forwarded to gitlog2csv.  ``releases`` is accepted but not used here.
    """
    if not os.path.exists(ouput_path):
        os.makedirs(ouput_path)

    for repo, pkg in modules.iteritems():
        # Progress: name the repository being processed on stderr.
        sys.stderr.write('%20s:' % repo + '\n')

        repo_logs = os.path.join(input_path, repo)

        for tag in sorted(pkg['tags'].keys(), cmp_version, reverse=True):
            log_file = os.path.join(repo_logs, '%s-%s.log' % (repo, tag))
            csv_prefix = os.path.join(ouput_path, '%s-%s' % (repo, tag))

            if gitlog2csv(log_file, csv_prefix, force):
                # A blank line on stdout for each run that returned output.
                sys.stdout.write('\n')
72
def _feed_csv(path, loader, repo, version):
    """Hand the rows of the CSV file at ``path`` to ``loader``.

    Returns False without calling ``loader`` when ``path`` is not a file,
    True otherwise.  The file is closed as soon as ``loader`` returns.
    """
    if not os.path.isfile(path):
        return False
    # NOTE(review): assumes the loader consumes the reader before
    # returning -- confirm against dbdriver.
    with open(path) as handle:
        loader(csv.reader(handle), repo, version)
    return True


def load_db(releases, modules, db_name, db_user,
            db_password='', db_host=None, csv_path=''):
    """Recreate the database tables and fill them from the gitdm CSV files.

    For every repository in ``modules`` and every key of its ``'tags'``
    mapping (sorted with ``cmp_version``, reversed), the files
    ``<repo>-<version>-changesets.csv`` and ``<repo>-<version>-filetypes.csv``
    found in ``csv_path`` are passed to ``db.changeset2db`` and
    ``db.filetype2db`` respectively.  Progress is reported on stderr.

    Nothing happens (and the database is not touched) when ``csv_path``
    does not exist.  ``releases`` is accepted but not used here.
    """
    # If there is no data directory, there is nothing to process.
    if not os.path.exists(csv_path):
        return

    # The PostgreSQL driver is hard-wired; the original source kept
    # dbdriver.sqlitedb(db_name) as a commented-out alternative.
    db = dbdriver.psqldb(db_name, db_user, db_password, db_host)
    db.drop_tables()
    db.create_tables()

    for (repo, pkg) in modules.iteritems():
        versions = sorted(pkg['tags'].keys(), cmp_version, reverse=True)
        status = ''

        for version in versions:
            stem = os.path.join(csv_path, '%s-%s' % (repo, version))

            # The trailing '\r' lets the next progress line overwrite this one.
            sys.stderr.write('%-20s:%s→%-6s\r' % (repo, status, version))

            # Only versions whose changesets file exists are listed in the
            # status line.
            if _feed_csv('%s-changesets.csv' % stem, db.changeset2db,
                         repo, version):
                status += (' %s' % version)

            _feed_csv('%s-filetypes.csv' % stem, db.filetype2db,
                      repo, version)

        sys.stderr.write('%-20s:%s %-6s\n' % (repo, status, 'done'))
110
if __name__ == '__main__':
    # Script entry point: convert the stored git logs to CSV with gitdm,
    # then load the resulting CSV files into the database.
    db_name = 'mining'
    db_user = 'gpoo'
    csv_path = settings['csvoutput']
    log_path = settings['logoutput']  # Now it is the input

    releases, modules = load_modules_from_csv(settings['csvfile'])

    git2csv(releases, modules, log_path, csv_path, force=False)
    # csv_path has to be passed by keyword: as the fifth positional argument
    # it would bind to db_password, leaving csv_path empty so that load_db
    # returns without loading anything.
    load_db(releases, modules, db_name, db_user, csv_path=csv_path)