tests: Add tests for analyzer (mbox parser)
[mining-tools:mlstats.git] / pymlstats / tests / test_analyzer.py
1 # -*- coding: utf-8 -*-
2 # Copyright (C) 2013-2014 Germán Poo-Caamaño <gpoo@gnome.org>
3 #
4 # This program is free software; you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation; either version 2 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program; if not, write to the Free Software
16 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 #
18 # Authors : Germán Poo-Caamaño <gpoo@gnome.org>
19
20 import unittest
21 import os
22
23 from pymlstats import analyzer
24
# Anchor the fixtures directory to this module's absolute location, so the
# paths stay valid when __file__ is relative (e.g. the module is run as a
# script) and the working directory changes afterwards.
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(CUR_DIR, 'data')
27
28
class MailArchiveAnalyzerEncodingTest(unittest.TestCase):
    """Field-by-field checks of single-message mbox fixtures.

    Each test loads one mbox file from ``DATA_PATH``, runs it through
    ``analyzer.MailArchiveAnalyzer.get_messages()`` and compares the
    first returned message with a hand-written dictionary of expected
    values.
    """

    def get_analyzer(self, path, **kwargs):
        """Return a ``MailArchiveAnalyzer`` for a fixture file.

        ``path`` is relative to ``DATA_PATH``.  Any extra keyword
        arguments are passed on to the ``MailArchiveAnalyzer``
        constructor instead of being silently discarded.
        """
        fname = os.path.join(DATA_PATH, path)
        return analyzer.MailArchiveAnalyzer(filepath=fname, **kwargs)

    def check_single_message(self, expected, messages):
        """Assert that ``messages[0]`` matches every field in ``expected``.

        ``expected`` maps field names to the values they must have.
        Only the keys present in ``expected`` are compared; other fields
        of the message are not inspected.  A field that is absent from
        the message is reported as a test failure naming that field.
        """
        for key, value in expected.items():
            try:
                obtained = messages[0][key]
            except KeyError:
                # Fail with the field name rather than let a bare
                # KeyError surface as an unexplained test error.
                self.fail(u"{}: missing from the parsed message".format(key))
            output = (u"{}:\n"
                      u"\tExpected: '{}'\n"
                      u"\tObtained: '{}'").format(key, value, obtained)
            self.assertEqual(value, obtained, output)

    def check_single_message_archive(self, expected, messages, non_parsed):
        """Run the assertions shared by every single-message test.

        Checks, in this order: ``messages`` holds exactly one entry, its
        fields match ``expected``, and ``non_parsed`` is 0.
        """
        self.assertEqual(1, len(messages), '# of messages')
        self.check_single_message(expected, messages)
        self.assertEqual(0, non_parsed, 'non_parsed')

    def test_single_message_no_encoding(self):
        """Single message whose expected content-type is None"""
        maa = self.get_analyzer('pharo-single.mbox')
        messages, non_parsed = maa.get_messages()
        expected = {
            # NOTE(review): the '?' in 'G?ran' is the expected output,
            # presumably standing in for a character of the signature
            # that could not be decoded -- confirm against the fixture.
            'body': u'Hi!\n\nA message in English, with a signature '
                    u'with a different encoding.\n\nregards, G?ran'
                    u'\n\n\n\n',
            'content-type': None,
            'date': '2010-12-01 14:26:40',
            'date_tz': '3600',
            'in-reply-to': None,
            'list-id': None,
            'message-id': u'<4CF64D10.9020206@domain.com>',
            'received': None,
            'references': None,
            'subject': u'[List-name] Protocol Buffers anyone?',
            'from': [(u'Göran Lastname', u'goran@domain.com')],
            'to': None,
            'cc': None,
        }

        self.check_single_message_archive(expected, messages, non_parsed)

    def test_single_message_with_quoted_printable_encoding(self):
        """Content-Transfer-Encoding: Quoted-printable"""
        maa = self.get_analyzer('gnome-quoted-printable.mbox')
        messages, non_parsed = maa.get_messages()
        # NOTE(review): 'body' is not part of the expected values, so the
        # decoded body of this message is never checked.
        expected = {  # noqa
            'content-type': u'text/plain; charset=utf-8',
            'date': '2008-03-17 10:35:05',
            'date_tz': '3600',
            'in-reply-to':
                u'<1205676169.6819.27.camel@user-computer.NETWORK> '
                u'(Simos\n\tXenitellis\'s message of "Sun\\, 16 Mar 2008 '
                u'14\\:02\\:49 +0000")',
            'list-id':
                u'GNOME Desktop Development List '
                u'<desktop-devel-list.gnome.org>',
            'message-id': u'<87iqzlofqu.fsf@avet.kvota.net>',
            'received': None,
            'references':
                u'<1204225143.12769.9.camel@localhost.localdomain>\n'
                u'\t<1204236062.14337.5.camel@localhost.localdomain>',
            'subject': u'Re: Low memory hacks',
            'from': [(u'Danilo Šegan', u'danilo@gnome.org')],
            'to': [(u'Simos Xenitellis', u'simos.lists@googlemail.com')],
            'cc': [('', u'desktop-devel-list@gnome.org'),
                   (u'Nikolay V. Shmyrev', u'nshmyrev@yandex.ru'),
                   (u'Brian Nitz', u'Brian.Nitz@sun.com'),
                   (u'Bastien Nocera', u'hadess@hadess.net')],
        }

        self.check_single_message_archive(expected, messages, non_parsed)

    def test_single_message_with_8_bit_encoding(self):
        """Content-Transfer-Encoding: 8bit"""
        maa = self.get_analyzer('gnome-8-bit.mbox')
        messages, non_parsed = maa.get_messages()
        expected = {  # noqa
            'body':
                u'El lun, 17-03-2008 a las 10:35 +0100, Danilo Šegan escribió:'
                u'\n> Hi Simos,\n'
                u'> \n\n'
                u'Hi,\n\n'
                u'[...]\n\n'
                u'Cheers.\n\n'
                u'> _______________________________________________\n'
                u'> desktop-devel-list mailing list\n'
                u'> desktop-devel-list@gnome.org\n'
                u'> http://mail.gnome.org/mailman/listinfo/desktop-devel-list'
                u'\n\n',
            'content-type': u'text/plain; charset=utf-8',
            'date': '2008-03-17 11:19:29',
            'date_tz': '3600',
            'in-reply-to': u'<87iqzlofqu.fsf@avet.kvota.net>',
            'list-id':
                u'GNOME Desktop Development List '
                u'<desktop-devel-list.gnome.org>',
            'message-id': u'<1205749169.7470.2.camel@aragorn>',
            'received': None,
            'references':
                u'<1204225143.12769.9.camel@localhost.localdomain>\n'
                u'\t<1204236062.14337.5.camel@localhost.localdomain>\n'
                u'\t<47C80957.6010804@sun.com> '
                u'<1204295876.30088.14.camel@t25>\n'
                u'\t<1205676169.6819.27.camel@user-computer.NETWORK>',
            'subject': u'Re: Low memory hacks',
            'from': [(u'Carlos Perelló Marín', u'carlos@gnome.org')],
            'to': [(u'Danilo Šegan', u'danilo@gnome.org')],
            'cc': [(u'Nikolay V. Shmyrev', u'nshmyrev@yandex.ru'),
                   (u'Brian Nitz', u'Brian.Nitz@sun.com'),
                   ('', u'desktop-devel-list@gnome.org'),
                   (u'Danilo Šegan', u'danilo@gnome.org'),
                   (u'Dmitry G. Mastrukov Дмитрий Геннадьевич Мастрюков',
                    u'dmitry@taurussoft.org'),
                   (u'Ing. Rafael de Jesús Fernández M.',
                    u'rafael@ipicyt.edu.mx')],
        }

        self.check_single_message_archive(expected, messages, non_parsed)

    def test_single_message_with_7_bit_encoding(self):
        """Content-Transfer-Encoding: 7bit"""
        maa = self.get_analyzer('gnome-7-bit.mbox')
        messages, non_parsed = maa.get_messages()
        # NOTE(review): 'subject' is not part of the expected values, so
        # the subject of this message is never checked.
        expected = {  # noqa
            'body':
                u">I don't think it's fair to blame the Foundation [...]\n"
                u">of packaging since it's really not (just) a case [...]\n"
                u'>marketing.\n\n'
                u'No matter what is really to blame, it ultimately [...]\n\n'
                u'[...]\n\n'
                u'Rgds,\n'
                u'Eugenia \n',
            'content-type':
                u'text/plain; format=flowed; charset="iso-8859-1";'
                u'\n\treply-type=original',
            'date': '2004-09-22 02:03:40',
            'date_tz': '-25200',
            'in-reply-to': None,
            'list-id':
                u'GNOME Desktop Development List '
                u'<desktop-devel-list.gnome.org>',
            'message-id': u'<BAY12-DAV6Dhd2stb2e0000c0ce@hotmail.com>',
            'received': None,
            'references': None,
            'from': [(u'Eugenia Loli-Queru', u'eloli@hotmail.com')],
            'to': [('', u'language-bindings@gnome.org'),
                   ('', u'desktop-devel-list@gnome.org')],
            'cc': None,
        }

        self.check_single_message_archive(expected, messages, non_parsed)