dpatch is ugly and unnecessary
[freedombox-privoxy:freedombox-privoxy.git] / https_everywhere_import.py
1 #!/usr/bin/env python
2
3 """
Reads the HTTPS Everywhere XML rule files and attempts to generate equivalent
action rules for Privoxy.

The XML is interpreted according to
https://www.eff.org/https-everywhere/rulesets. However, the rules in git under
src/chrome/content/rules use additional options (grep for match_rule, for
example), so that page may be out of date.
10
11 Copyright 2012 James Vasile
12 Released under GPLv3 or later
13 """
14
15 import os, sys
16
# Directory holding the HTTPS Everywhere XML rules.  The development source
# tree is the default; the release checkout takes precedence whenever it is
# present on disk.
rule_dir = "vendor/https-everywhere/src/chrome/content/rules"
if os.path.exists("vendor/https-everywhere-release/chrome/content/rules"):
    rule_dir = "vendor/https-everywhere-release/chrome/content/rules"
21
22 from BeautifulSoup import BeautifulSoup
23 from lxml import etree
24
25
class UnknownRulesetAttribute(Exception):
    """Signals a <ruleset> attribute that this script does not know how to handle.

    The constructor takes ONE argument: a three-item sequence
    ``(xml, element, key)`` holding the XML text being translated, the
    offending element and the name of the unexpected attribute.
    """

    def __init__(self, args):
        xml, element, key = args
        self.xml = xml
        self.element = element
        self.key = key

    def __str__(self):
        # Show the attribute together with the value found on the element.
        value = self.element.attrib[self.key]
        return "%s = %s" % (self.key, value)
32
class UnknownTargetAttribute(Exception):
    """Signals a <target> attribute that this script does not know how to handle.

    The constructor takes ONE argument: a three-item sequence
    ``(xml, element, key)`` holding the XML text being translated, the
    offending element and the name of the unexpected attribute.
    """

    def __init__(self, args):
        xml, element, key = args
        self.xml = xml
        self.element = element
        self.key = key

    def __str__(self):
        # Show the attribute together with the value found on the element.
        value = self.element.attrib[self.key]
        return "%s = %s" % (self.key, value)
38
# Hand-written pattern overrides, looked up by cleanup().
#
# Layout: {ruleset name: {pattern as written in the XML: pattern to emit}}.
# Every key contains a brace quantifier ({1}, {0,2}, {2,}) and every
# replacement is brace-free -- presumably because braces clash with the
# "{+redirect{...}}" action syntax this script emits; TODO confirm.
#
# The patterns are raw strings so that regex escapes such as "\." and "\d"
# are not parsed as (invalid) Python string escapes.
custom = {
    "Crucial.com (partial)": {
        r"^http://www\.crucial\.com/(images\d{0,2}|js|css|reviews)/":
            r"^http://www\.crucial\.com/(images\d*|js|css|reviews)/",
    },
    "Epson.com (partial)": {
        r"^https://(www\.)?epson\.com/(([a-zA-Z]([a-zA-Z0-9])+){1})$":
            r"^https://(www\.)?epson\.com/(([a-zA-Z]([a-zA-Z0-9])+))$",
    },
    "MoveOn": {
        r"^https?://civic\.moveon\.org/([a-z0-9]+){1}/{2,}":
            r"^https?://civic\.moveon\.org/([a-z0-9]+)/+",
        r"^http://(?:www\.)?moveon\.org/(([^a-z0-9]+)|([a-z0-9]{2,}\?)|([a-qs-z0-9]\?)|([a-z0-9]+[^a-z0-9?]+)){1}":
            r"^http://(?:www\.)?moveon\.org/(([^a-z0-9]+)|([a-z0-9]+\?)|([a-qs-z0-9]\?)|([a-z0-9]+[^a-z0-9?]+))",
        r"^http://(pol|civ)\.moveon\.org/([^a-z0-9]+|([a-z0-9]+[^a-z0-9]+)|$){1}":
            r"^http://(pol|civ)\.moveon\.org/([^a-z0-9]+|([a-z0-9]+[^a-z0-9]+)|$)",
        r"^http://civic\.moveon\.org/(([^a-z0-9]+)|([a-z0-9]+[^a-z0-9/]+)|([a-z0-9]+/($|[^/]+))|$){1}":
            r"^http://civic\.moveon\.org/(([^a-z0-9]+)|([a-z0-9]+[^a-z0-9/]+)|([a-z0-9]+/($|[^/]+))|$)",
    },
    "Kintera Network": {
        r"^http://([-a-zA-Z0-9_]+\.)?([-a-zA-Z0-9_]+)\.kintera\.org/([^/]+/[^/]){1}":
            r"^http://([-a-zA-Z0-9_]+\.)?([-a-zA-Z0-9_]+)\.kintera\.org/([^/]+/[^/])",
    },
}
55
def cleanup(name, att):
    """Return the form of pattern *att* to emit for the ruleset called *name*.

    A hand-written replacement from ``custom`` wins when one exists for this
    exact ruleset name and pattern.  Otherwise every ``#`` and ``@`` in the
    pattern is backslash-escaped (``@`` is the delimiter of the ``s@..@..@``
    substitutions built from these patterns).
    """
    try:
        return custom[name][att]
    except KeyError:
        # No override for this ruleset/pattern pair: escape it instead.
        pass

    escaped = att
    for special in ("#", "@"):
        escaped = escaped.replace(special, r"\%s" % special)
    return escaped
63
def _write_line(stream, text=""):
    """Write *text* followed by a newline to *stream*.

    Values that are not native ``str`` objects (Python 2 ``unicode``
    attribute values) are encoded as UTF-8 first, so that non-ASCII ruleset
    names do not break output redirected to a file or pipe.
    """
    if not isinstance(text, str):
        text = text.encode("UTF-8")
    stream.write(text + "\n")


def _translate_one(xml, element):
    """Print the privoxy action section for a single <ruleset> *element*.

    *xml* is the document text; it is only carried along so that it can be
    attached to the exceptions raised here.

    Nothing is printed for a ruleset flagged ``default_off`` or for one with
    no usable <target>.

    Raises UnknownRulesetAttribute / UnknownTargetAttribute when an element
    carries an attribute this script does not know about.
    """
    attrib = element.attrib
    # Resolve the name up front: the warnings below need it even when the
    # "name" attribute is written after the attribute that triggers them,
    # or is absent altogether.
    name = attrib.get("name", "(unnamed ruleset)")

    for key in attrib.keys():
        if key == "default_off":
            return
        elif key == "name":
            continue
        elif key == "match_rule":
            _write_line(sys.stderr,
                        "Warning: match_rule attribute encountered in %s" % name)
        elif key == "platform":
            _write_line(sys.stderr,
                        "Warning: platform rule encountered in %s" % name)
        else:
            raise UnknownRulesetAttribute([xml, element, key])

    hosts = []
    for target_element in element.iter("target"):
        target_attrib = target_element.attrib
        if "default_off" not in target_attrib:
            hosts.append(target_attrib["host"])
        for key in target_attrib.keys():
            if key not in ("host", "default_off"):
                # The exception expects the (xml, element, key) triple.
                raise UnknownTargetAttribute([xml, target_element, key])
    if not hosts:
        _write_line(sys.stderr, "Warning: no target for %s" % name)
        return

    # One section per ruleset: a comment with its name, a single +redirect
    # action holding all substitutions separated by tabs, the target hosts
    # (one per line) and a terminating blank line.
    substitutions = [
        "s@%s@%s@" % (cleanup(name, rule_element.attrib["from"]),
                      cleanup(name, rule_element.attrib["to"]))
        for rule_element in element.iter("rule")
    ]
    _write_line(sys.stdout, "# %s" % name)
    _write_line(sys.stdout, "{+redirect{%s}}" % "\t".join(substitutions))
    for host in hosts:
        _write_line(sys.stdout, host)
    _write_line(sys.stdout)


def translate_ruleset(xml):
    """Translate the rulesets in the XML text *xml* into privoxy actions.

    The result is printed to stdout; warnings go to stderr.  If the document
    contains a <rulesetlibrary> element, only the <ruleset> elements inside
    the first one are translated; otherwise every <ruleset> in the document
    is.

    If the text cannot be parsed it is dumped to stdout and the parser's
    exception is re-raised.
    """
    # Presumably works around a malformed <rule> in some upstream file that
    # has a stray "host" after "from" -- TODO confirm which one.
    xml = xml.replace("rule from host", "rule from")
    try:
        root = etree.XML(xml)
    except Exception:
        # Show the offending document to whoever is debugging the import.
        _write_line(sys.stdout, xml)
        raise

    library = next(root.iter("rulesetlibrary"), None)
    container = root if library is None else library
    for ruleset in container.iter("ruleset"):
        _translate_one(xml, ruleset)
116
def main(rule_dir=rule_dir):
    """Translate the rulesets found in *rule_dir* and print the result.

    When ``default.rulesets`` exists in *rule_dir* it is the only file
    translated.  Otherwise every ``*.xml`` file in the directory is
    translated, in sorted filename order so that the generated output is the
    same from one run to the next (``os.listdir`` order is arbitrary).
    """
    default_ruleset = os.path.join(rule_dir, "default.rulesets")
    if os.path.exists(default_ruleset):
        paths = [default_ruleset]
    else:
        paths = [os.path.join(rule_dir, fname)
                 for fname in sorted(os.listdir(rule_dir))
                 if fname.endswith('.xml')]

    for path in paths:
        with open(path, 'r') as inf:
            translate_ruleset(inf.read())
127
128
# Run the translation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()