Commit 42480d9588c9b0d553a3cd03d1653ca507d9ab56

Fix for detection
failburn.py
(60 / 14)
  
66import urllib2
77import time
88import os
9import cookielib
910
11from BeautifulSoup import BeautifulSoup
12
1013# fake user-agent
1114headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; ' \
12 'rv:1.9.0.2) Gecko/2008092313 Ubuntu/8.04 (hardy) Firefox/3.1.6'}
15 'rv:1.9.0.2) Gecko/2008092313 Ubuntu/8.04 (hardy) ' \
16 'Firefox/3.1.6'}
1317
14#sources = ['Inidoneas', 'Suspensas']
15sources = ['Testing']
18sources = ['Inidoneas', 'Suspensas']
19#sources = ['Testing']
1620
21cookie_jar = cookielib.CookieJar()
22opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
23
1724for source in sources:
1825 print source + ':'
1926 content = csv.reader(open(source + '.csv'), delimiter=';')
2828 # The data have two headers
2929 content.next()
3030 content.next()
31
3231 for row in content:
33 print row[0], row[1].decode('iso-8859-1'),
32 print row[0], row[1].decode('iso-8859-1')
3433
3534 # remove ".", "/" and "-". Do not translate any other characters.
3635 cnpj_clean = row[0].translate(None, './-')
3736 result_filename = cnpj_clean + '.result'
3837
39 if os.access(result_filename, os.F_OK):
40 print ' --> skipping'
41 continue
42
4338 form = {
4439 'acao': 'resumo',
4540 'cdCpfCnpjDoador': cnpj_clean,
6161
6262 response = urllib2.urlopen(request)
6363 data = response.read()
64 cookie = response.headers['Set-Cookie']
6465 response.close()
6566
66 output = file(result_filename, 'w')
67 output.write(data)
68 output.close()
67 # crude heuristic (probably not very robust): an <iframe> in the page is taken to mean donors are listed
68 if '<iframe' in data:
69 # Yup, donors.
70 request = urllib2.Request(url='http://www4.tse.gov.br/' \
71 '/spce2008ConsultaFinanciamento/listaReceitaCand.jsp')
6972
70 print ' --> done'
73 for key in headers:
74 request.add_header(key, headers[key])
7175
72 print
76 request.add_header('Cookie', cookie)
77
78 response = urllib2.urlopen(request)
79 data = response.read()
80 response.close()
81
82 soup = BeautifulSoup(data)
83 rows = soup.findAll('tr')
84 # 0: Empty
85 # 1: Donor name
86 # 2: Empty
87 # 3: Donor CNPJ
88 # 4: Empty
89 # 5: Date
90 # 6: Empty
91 # 7: Donation Value
92 # 8: Empty
93 # 9: Resource type
94 # 10: Empty
95 # 11: Transfer type
96 # 12: Empty
97 # 13: Name (and, inside it, we have a lot of <br/>s)
98 # 14: Empty
99 # 15: Candidate number
100 # 16: Empty
101 # 17: Party
102 # 18: Empty
103 # 19: Position
104 # 20: Empty
105 # 21: City
106 # 22: Empty
107
108 for data in rows[2:]:
109 print '%s = %s (%s %s) %s' % (
110 unicode(data.contents[7].contents[0]).strip(),
111 unicode(data.contents[13].contents[2]).strip(),
112 unicode(data.contents[15].contents[0]).strip(),
113 unicode(data.contents[17].contents[0]).strip(),
114 unicode(data.contents[21].contents[0]).strip())
115 print
116 time.sleep(30)