use release version of https-everywhere
[freedombox-privoxy:freedombox-privoxy.git] / https_everywhere_import.py
1 #!/usr/bin/env python
2
3 """
4 Reads the xml rules files and attempts to generate equivalent action rules for privoxy.
5
6 XML interpreted according to
7 https://www.eff.org/https-everywhere/rulesets but git
8 src/chrome/content/rules has rules with additional options (grep for
9 match_rule, for example), so maybe that page is out of date.
10
11 Copyright 2012 James Vasile
12 Released under GPLv3 or later
13 """
14
15 import os, sys
16
# Directory holding the HTTPS Everywhere rule files.  The release checkout
# is preferred; the git source tree is the fallback.  If the loop finishes
# without a break, rule_dir is left bound to the last candidate (the source
# tree), whether or not that path exists.
for rule_dir in ("vendor/https-everywhere-release/chrome/content/rules",
                 "vendor/https-everywhere/src/chrome/content/rules"):
    if os.path.exists(rule_dir):
        break
20
21 from BeautifulSoup import BeautifulSoup
22 from lxml import etree
23
24
class UnknownRulesetAttribute(Exception):
    """A <ruleset> element carries an attribute this script does not handle.

    The constructor takes ONE argument: a three-item sequence
    ``(xml, element, key)`` holding the ruleset document text, the
    offending element and the name of the unrecognised attribute.
    """

    def __init__(self, args):
        xml, element, key = args
        self.xml = xml
        self.element = element
        self.key = key

    def __str__(self):
        # Rendered as "<attribute name> = <attribute value>".
        value = self.element.attrib[self.key]
        return "%s = %s" % (self.key, value)
31
class UnknownTargetAttribute(Exception):
    """A <target> element carries an attribute other than the handled ones.

    The constructor takes ONE argument: a three-item sequence
    ``(xml, element, key)`` holding the ruleset document text, the
    offending element and the name of the unrecognised attribute.
    """

    def __init__(self, args):
        xml, element, key = args
        self.xml = xml
        self.element = element
        self.key = key

    def __str__(self):
        # Rendered as "<attribute name> = <attribute value>".
        value = self.element.attrib[self.key]
        return "%s = %s" % (self.key, value)
37
# Hand-written pattern overrides consulted by cleanup().
#
# Layout: {ruleset name: {pattern as found in the rule file: replacement}}.
# Every replacement below rewrites a brace quantifier ({1}, {0,2}, {2,})
# into a brace-free equivalent -- presumably because braces clash with the
# "{+redirect{...}}" action syntax; TODO confirm against privoxy.
#
# Raw strings are used so that regex escapes such as "\." and "\d" are not
# treated as (invalid) Python string escapes; the resulting values are
# identical to the non-raw originals.
custom = {
    "Crucial.com (partial)": {
        r"^http://www\.crucial\.com/(images\d{0,2}|js|css|reviews)/":
            r"^http://www\.crucial\.com/(images\d*|js|css|reviews)/",
    },
    "Epson.com (partial)": {
        r"^https://(www\.)?epson\.com/(([a-zA-Z]([a-zA-Z0-9])+){1})$":
            r"^https://(www\.)?epson\.com/(([a-zA-Z]([a-zA-Z0-9])+))$",
    },
    "MoveOn": {
        r"^https?://civic\.moveon\.org/([a-z0-9]+){1}/{2,}":
            r"^https?://civic\.moveon\.org/([a-z0-9]+)/+",
        r"^http://(?:www\.)?moveon\.org/(([^a-z0-9]+)|([a-z0-9]{2,}\?)|([a-qs-z0-9]\?)|([a-z0-9]+[^a-z0-9?]+)){1}":
            r"^http://(?:www\.)?moveon\.org/(([^a-z0-9]+)|([a-z0-9]+\?)|([a-qs-z0-9]\?)|([a-z0-9]+[^a-z0-9?]+))",
        r"^http://(pol|civ)\.moveon\.org/([^a-z0-9]+|([a-z0-9]+[^a-z0-9]+)|$){1}":
            r"^http://(pol|civ)\.moveon\.org/([^a-z0-9]+|([a-z0-9]+[^a-z0-9]+)|$)",
        r"^http://civic\.moveon\.org/(([^a-z0-9]+)|([a-z0-9]+[^a-z0-9/]+)|([a-z0-9]+/($|[^/]+))|$){1}":
            r"^http://civic\.moveon\.org/(([^a-z0-9]+)|([a-z0-9]+[^a-z0-9/]+)|([a-z0-9]+/($|[^/]+))|$)",
    },
    "Kintera Network": {
        r"^http://([-a-zA-Z0-9_]+\.)?([-a-zA-Z0-9_]+)\.kintera\.org/([^/]+/[^/]){1}":
            r"^http://([-a-zA-Z0-9_]+\.)?([-a-zA-Z0-9_]+)\.kintera\.org/([^/]+/[^/])",
    },
}
54
def cleanup(name, att):
    """Return the form of pattern ``att`` to use in the generated output.

    ``name`` is the ruleset name and ``att`` a rule's ``from`` or ``to``
    value.  When ``custom`` holds a replacement for this exact
    (name, att) pair, that replacement is returned unchanged.  Otherwise
    every "#" and "@" in ``att`` is prefixed with a backslash ("@" is the
    delimiter of the ``s@from@to@`` expressions built from the result).
    """
    overrides = custom.get(name)
    if overrides is not None and att in overrides:
        return overrides[att]
    return att.replace("#", r"\#").replace("@", r"\@")
62
def translate_ruleset(xml):
    """Write privoxy redirect actions for the ruleset(s) in ``xml`` to stdout.

    ``xml`` is the text of an HTTPS Everywhere rule document: either a
    single ``<ruleset>`` or a ``<rulesetlibrary>`` containing several.
    For every enabled ruleset the following lines are written::

        # <ruleset name>
        {+redirect{s@<from>@<to>@<TAB>s@<from>@<to>@}}
        <target host>        (one line per enabled target)
        <blank line>

    Rulesets carrying ``default_off`` are skipped silently.  Rulesets
    without any enabled ``<target>`` are skipped with a warning on stderr.
    ``match_rule`` and ``platform`` ruleset attributes only produce a
    warning on stderr.

    Raises:
        UnknownRulesetAttribute: a ``<ruleset>`` has an unhandled attribute.
        UnknownTargetAttribute: a ``<target>`` has an attribute other than
            ``host`` or ``default_off``.
        Exception: whatever ``etree.XML`` raises for unparseable input is
            re-raised after the offending document is dumped to stderr.
    """

    def emit(line):
        """Write one line of action-file output to stdout."""
        # Attribute values may be unicode objects under Python 2; those are
        # encoded as UTF-8 so that piping stdout to a file cannot fail.
        if not isinstance(line, str):
            line = line.encode("UTF-8")
        sys.stdout.write(line + "\n")

    def do_rule(ruleset):
        """Translate one <ruleset> element."""
        attrib = ruleset.attrib
        # Decide on default_off and the name up front so that the outcome
        # does not depend on the order of the attributes in the document.
        if 'default_off' in attrib:
            return
        name = attrib.get('name', "(unnamed ruleset)")

        for key in attrib.keys():
            if key == 'name':
                continue
            elif key == "match_rule":
                sys.stderr.write("Warning: match_rule attribute encountered in %s\n" % name)
            elif key == "platform":
                sys.stderr.write("Warning: platform rule encountered in %s\n" % name)
            else:
                raise UnknownRulesetAttribute([xml, ruleset, key])

        # Each loop below uses its own variable: re-using the ruleset's
        # name would leave it bound to the last <target>, and the <rule>
        # search would then run inside that target and find nothing.
        hosts = []
        for target in ruleset.iter("target"):
            for key in target.attrib.keys():
                if key != 'host' and key != 'default_off':
                    # The exception expects an (xml, element, key) triple.
                    raise UnknownTargetAttribute([xml, target, key])
            if 'default_off' not in target.attrib:
                hosts.append(target.attrib['host'])
        if not hosts:
            sys.stderr.write("Warning: no target for %s\n" % name)
            return

        emit("# %s" % name)
        substitutions = [
            "s@%s@%s@" % (cleanup(name, rule.attrib['from']),
                          cleanup(name, rule.attrib['to']))
            for rule in ruleset.iter("rule")
        ]
        emit("{+redirect{%s}}" % "\t".join(substitutions))
        for host in hosts:
            emit(host)
        emit("")

    try:
        # Repair the stray "host" token of a malformed <rule> in the
        # upstream data before handing the text to the parser.
        xml = xml.replace("rule from host", "rule from")
        root = etree.XML(xml)
    except Exception:
        # Dump the offending document for debugging.  stderr is used so the
        # generated action file on stdout is not polluted.
        sys.stderr.write("%s\n" % (xml,))
        raise

    # When the document has a <rulesetlibrary>, only the rulesets inside
    # the first one are translated; otherwise every <ruleset> is.  Compare
    # against None explicitly: a childless lxml element is falsy.
    library = next(root.iter("rulesetlibrary"), None)
    container = root if library is None else library
    for ruleset in container.iter("ruleset"):
        do_rule(ruleset)
115
def main(rule_dir=rule_dir):
    """Translate every ruleset found in ``rule_dir`` and print the result.

    If ``rule_dir`` contains a ``default.rulesets`` file, only that file is
    translated.  Otherwise each ``*.xml`` file in the directory is
    translated, in sorted file-name order.

    Args:
        rule_dir: directory holding the rule files; defaults to the
            module-level ``rule_dir``.
    """
    default_ruleset = os.path.join(rule_dir, "default.rulesets")
    if os.path.exists(default_ruleset):
        paths = [default_ruleset]
    else:
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # generated action file identical from one run to the next.
        paths = [os.path.join(rule_dir, fname)
                 for fname in sorted(os.listdir(rule_dir))
                 if fname.endswith('.xml')]

    for path in paths:
        with open(path, 'r') as inf:
            translate_ruleset(inf.read())


if __name__ == "__main__":
    main()