Commit 42480d9588c9b0d553a3cd03d1653ca507d9ab56
- Diff rendering mode:
- inline
- side by side
failburn.py
(60 / 14)
|   | |||
| 6 | 6 | import urllib2 | |
| 7 | 7 | import time | |
| 8 | 8 | import os | |
| 9 | import cookielib | ||
| 9 | 10 | ||
| 11 | from BeautifulSoup import BeautifulSoup | ||
| 12 | |||
| 10 | 13 | # fake user-agent | |
| 11 | 14 | headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; ' \ | |
| 12 | 'rv:1.9.0.2) Gecko/2008092313 Ubuntu/8.04 (hardy) Firefox/3.1.6'} | ||
| 15 | 'rv:1.9.0.2) Gecko/2008092313 Ubuntu/8.04 (hardy) ' \ | ||
| 16 | 'Firefox/3.1.6'} | ||
| 13 | 17 | ||
| 14 | #sources = ['Inidoneas', 'Suspensas'] | ||
| 15 | sources = ['Testing'] | ||
| 18 | sources = ['Inidoneas', 'Suspensas'] | ||
| 19 | #sources = ['Testing'] | ||
| 16 | 20 | ||
| 21 | cookie_jar = cookielib.CookieJar() | ||
| 22 | opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar)) | ||
| 23 | |||
| 17 | 24 | for source in sources: | |
| 18 | 25 | print source + ':' | |
| 19 | 26 | content = csv.reader(open(source + '.csv'), delimiter=';') | |
| … | … | ||
| 28 | 28 | # The data have two headers | |
| 29 | 29 | content.next() | |
| 30 | 30 | content.next() | |
| 31 | |||
| 32 | 31 | for row in content: | |
| 33 | print row[0], row[1].decode('iso-8859-1'), | ||
| 32 | print row[0], row[1].decode('iso-8859-1') | ||
| 34 | 33 | ||
| 35 | 34 | # remove ".", "/" and "-". Do not translate any other characters. | |
| 36 | 35 | cnpj_clean = row[0].translate(None, './-') | |
| 37 | 36 | result_filename = cnpj_clean + '.result' | |
| 38 | 37 | ||
| 39 | if os.access(result_filename, os.F_OK): | ||
| 40 | print ' --> skipping' | ||
| 41 | continue | ||
| 42 | |||
| 43 | 38 | form = { | |
| 44 | 39 | 'acao': 'resumo', | |
| 45 | 40 | 'cdCpfCnpjDoador': cnpj_clean, | |
| … | … | ||
| 61 | 61 | ||
| 62 | 62 | response = urllib2.urlopen(request) | |
| 63 | 63 | data = response.read() | |
| 64 | cookie = response.headers['Set-Cookie'] | ||
| 64 | 65 | response.close() | |
| 65 | 66 | ||
| 66 | output = file(result_filename, 'w') | ||
| 67 | output.write(data) | ||
| 68 | output.close() | ||
| 67 | # this is probably not very effective | ||
| 68 | if '<iframe' in data: | ||
| 69 | # Yup, donators. | ||
| 70 | request = urllib2.Request(url='http://www4.tse.gov.br/' \ | ||
| 71 | '/spce2008ConsultaFinanciamento/listaReceitaCand.jsp') | ||
| 69 | 72 | ||
| 70 | print ' --> done' | ||
| 73 | for key in headers: | ||
| 74 | request.add_header(key, headers[key]) | ||
| 71 | 75 | ||
| 72 | |||
| 76 | request.add_header('Cookie', cookie) | ||
| 77 | |||
| 78 | response = urllib2.urlopen(request) | ||
| 79 | data = response.read() | ||
| 80 | response.close() | ||
| 81 | |||
| 82 | soup = BeautifulSoup(data) | ||
| 83 | rows = soup.findAll('tr') | ||
| 84 | # 0: Empty | ||
| 85 | # 1: Donator name | ||
| 86 | # 2: Empty | ||
| 87 | # 3: Donator CNPJ | ||
| 88 | # 4: Empty | ||
| 89 | # 5: Date | ||
| 90 | # 6: Empty | ||
| 91 | # 7: Donation Value | ||
| 92 | # 8: Empty | ||
| 93 | # 9: Resource type | ||
| 94 | # 10: Empty | ||
| 95 | # 11: Transfer type | ||
| 96 | # 12: Empty | ||
| 97 | # 13: Name (and, inside it, we have a lot of <br/>s) | ||
| 98 | # 14: Empty | ||
| 99 | # 15: Candidate number | ||
| 100 | # 16: Empty | ||
| 101 | # 17: Party | ||
| 102 | # 18: Empty | ||
| 103 | # 19: Position | ||
| 104 | # 20: Empty | ||
| 105 | # 21: City | ||
| 106 | # 22: Empty | ||
| 107 | |||
| 108 | for data in rows[2:]: | ||
| 109 | print '%s = %s (%s %s) %s' % ( | ||
| 110 | unicode(data.contents[7].contents[0]).strip(), | ||
| 111 | unicode(data.contents[13].contents[2]).strip(), | ||
| 112 | unicode(data.contents[15].contents[0]).strip(), | ||
| 113 | unicode(data.contents[17].contents[0]).strip(), | ||
| 114 | unicode(data.contents[21].contents[0]).strip()) | ||
| 115 | |||
| 116 | time.sleep(30) |

