"""
PRE-PROCESSORS
=============================================================================

Preprocessors work on source text before we start doing anything too
complicated.
"""
8
9
import re
10
import util
11
import odict
12
13
14
def build_preprocessors(md_instance, **kwargs):
    """ Build the default set of preprocessors used by Markdown.

    Returns an ordered mapping of preprocessor name to instance. The
    "html_block" preprocessor is registered before "reference", so that is
    the order in which the mapping yields them. Each preprocessor is
    constructed with `md_instance`; `kwargs` is accepted but not used here.

    """
    preprocessors = odict.OrderedDict()
    preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
    preprocessors["reference"] = ReferencePreprocessor(md_instance)
    return preprocessors
20
21
22
class Preprocessor(util.Processor):
    """
    Preprocessors are run after the text is broken into lines.

    Each preprocessor implements a "run" method that takes a pointer to a
    list of lines of the document, modifies it as necessary and returns
    either the same pointer or a pointer to a new list.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Each subclass of Preprocessor should override the `run` method, which
        takes the document as a list of strings split by newlines and returns
        the (possibly modified) list of lines.

        This base implementation does nothing and returns None.

        """
        pass
41
42
43
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval.

    The document is split into blocks on blank lines. Blocks that open with
    a block-level tag (as decided by `util.isBlockLevel`), a comment/doctype
    (`<!`), or a processing-instruction style tag (`<?`, `<@`, `<%`) are
    replaced by whatever `self.markdown.htmlStash.store` returns for them.
    All other blocks are passed through unchanged.

    """

    # Templates (filled with the left tag name) tried in order when looking
    # for the closing tag: "</tag>" first, then "tag>" (e.g. "-->" or "?>").
    right_tag_patterns = ["</%s>", "%s>"]
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR 
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, a block whose opening tag carries a `markdown` attribute has
    # only its opening and closing tags stashed; the content between them is
    # left in the output blocks.
    markdown_in_raw = False

    def _get_left_tag(self, block):
        """Parse the opening tag at the start of `block`.

        Returns a tuple `(tag, length, attrs)`: the tag name, the number of
        characters the opening tag occupies in `block`, and a dict of its
        attributes (attributes without a value map to "").

        """
        m = self.left_tag_re.match(block)
        if not m:
            # Fallback for text the pattern rejects (e.g. "< foo" or "<>").
            # An empty word list yields an empty tag name instead of raising
            # IndexError.
            words = block[1:].replace(">", " ", 1).split()
            tag = words[0].lower() if words else ""
            return tag, len(tag) + 2, {}

        attrs = {}
        raw_attrs = m.group('attrs')
        if raw_attrs:
            for ma in self.attrs_re.finditer(raw_attrs):
                if ma.group('attr'):
                    # attr="value" / attr='value'
                    attrs[ma.group('attr').strip()] = ma.group('value') or ""
                elif ma.group('attr1'):
                    # attr=value
                    attrs[ma.group('attr1').strip()] = ma.group('value1') or ""
                elif ma.group('attr2'):
                    # bare attr
                    attrs[ma.group('attr2').strip()] = ""
        return m.group('tag'), len(m.group(0)), attrs

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """Find the `rtag` that balances an already-open `ltag`.

        Searches `block` from `start_index`, recursing past nested
        `ltag`...`rtag` pairs. Returns the index just past the matching
        `rtag`, or -1 if none is found.

        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if j > i or j == -1:
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """Locate the closing tag for `left_tag` in `block`.

        The search starts at `left_index`. Returns `(right_tag, data_index)`
        where `data_index` is the index just past the closing tag. If no
        closing tag is found, `right_tag` is guessed from the tail of the
        block and `data_index` is `len(block)`.

        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """Return True if `right_tag` is accepted as closing `left_tag`."""
        # Slice instead of index so an empty tag name cannot raise.
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if right_tag == "--" and left_tag == "--":
            return True
        # NOTE(review): this accepts any leading character other than "<"
        # (not only "/"); kept as in the original -- confirm whether "/" was
        # intended.
        return left_tag == right_tag[1:] and right_tag[0] != "<"

    def _is_oneliner(self, tag):
        """Return True for tags that never have a closing tag (`hr`)."""
        return tag in ['hr', 'hr/']

    def _strip_markdown_attr(self, opening_tag):
        """Return `opening_tag` with its `markdown` attribute removed."""
        return re.sub(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?', '', opening_tag)

    def _stash_items(self, items, left_index, right_tag, attrs, new_blocks):
        """Stash the accumulated multi-block HTML `items` into `new_blocks`.

        With `markdown_in_raw` set and a `markdown` attribute on the opening
        tag, only the opening and closing tags are stashed and the content
        between them is appended as ordinary blocks. Otherwise the items are
        joined with blank lines and stashed as one piece.

        """
        store = self.markdown.htmlStash.store
        if self.markdown_in_raw and 'markdown' in attrs:
            start = self._strip_markdown_attr(items[0][:left_index])
            items[0] = items[0][left_index:]
            end = items[-1][-len(right_tag)-2:]
            items[-1] = items[-1][:-len(right_tag)-2]
            new_blocks.append(store(start))
            new_blocks.extend(items)
            new_blocks.append(store(end))
        else:
            new_blocks.append(store('\n\n'.join(items)))

    def run(self, lines):
        """Stash raw HTML blocks found in `lines`.

        Takes the document as a list of lines and returns a new list of
        lines in which raw HTML blocks have been replaced by the values
        returned from `self.markdown.htmlStash.store`.

        """
        store = self.markdown.htmlStash.store
        text = "\n".join(lines).split("\n\n")
        new_blocks = []
        items = []       # blocks of an HTML element that is still open
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False   # True while inside an unclosed HTML element

        while text:
            block = text.pop(0)
            # Drop up to two leading newlines left over from the split.
            if block.startswith("\n"):
                block = block[1:]
            if block.startswith("\n"):
                block = block[1:]

            if in_tag:
                items.append(block)
                # Search this block from its start: `left_index` is an
                # offset into the block that opened the tag, not this one.
                right_tag, data_index = self._get_right_tag(left_tag, 0,
                                                            block)
                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag
                    in_tag = False
                    self._stash_items(items, left_index, right_tag, attrs,
                                      new_blocks)
                    items = []
                continue

            if not (block.startswith("<") and len(block.strip()) > 1):
                new_blocks.append(block)
                continue

            if block[1] == "!":
                # is a comment block; do not parse its text as attributes
                left_tag, left_index, attrs = "--", 2, {}
            else:
                left_tag, left_index, attrs = self._get_left_tag(block)
            right_tag, data_index = self._get_right_tag(left_tag,
                                                        left_index,
                                                        block)

            # Text trailing the closing tag is pushed back to be processed
            # as its own block.
            if data_index < len(block) \
                    and (util.isBlockLevel(left_tag) or left_tag == "--"):
                text.insert(0, block[data_index:])
                block = block[:data_index]

            if not (util.isBlockLevel(left_tag)
                    or block[1] in ["!", "?", "@", "%"]):
                # not raw HTML we handle; leave it for later processing
                new_blocks.append(block)
                continue

            if self._is_oneliner(left_tag):
                new_blocks.append(block.strip())
                continue

            if block.rstrip().endswith(">") \
                    and self._equal_tags(left_tag, right_tag):
                # the element is complete within this one block
                if self.markdown_in_raw and 'markdown' in attrs:
                    start = self._strip_markdown_attr(block[:left_index])
                    end = block[-len(right_tag)-2:]
                    block = block[left_index:-len(right_tag)-2]
                    new_blocks.append(store(start))
                    new_blocks.append(block)
                    new_blocks.append(store(end))
                else:
                    new_blocks.append(store(block.strip()))
            elif util.isBlockLevel(left_tag) \
                    or (left_tag == "--"
                        and not block.rstrip().endswith(">")):
                # block level tag (or unterminated comment) that is not
                # complete: keep collecting blocks until it closes
                items.append(block.strip())
                in_tag = True
            else:
                new_blocks.append(store(block.strip()))

        if items:
            # The document ended while a tag was still open.
            self._stash_items(items, left_index, right_tag, attrs,
                              new_blocks)
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
250
251
252
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # group 1: up to three leading spaces; group 2: the reference id;
    # group 3: the link (up to the first space); group 4: the remainder of
    # the line (potential title).
    RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)

    def run(self, lines):
        """Collect reference definitions from `lines`.

        Each line of the form `[id]: link "title"` is removed and recorded
        in `self.markdown.references` as `id -> (link, title)`, with the id
        stripped and lower-cased. The title may be absent, or wrapped in
        double quotes, single quotes or parentheses. A matching line whose
        trailing text is not a well-formed title is left in place. Returns
        the list of remaining lines.

        """
        new_text = []
        for line in lines:
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue

            ref_id = m.group(2).strip().lower()
            link = m.group(3).lstrip('<').rstrip('>')
            title = m.group(4).strip()  # potential title
            if not title:
                self.markdown.references[ref_id] = (link, title)
            elif (len(title) >= 2
                  and (title[0] == title[-1] == "\""
                       or title[0] == title[-1] == "\'"
                       or (title[0] == "(" and title[-1] == ")"))):
                # drop the surrounding quotes / parentheses
                self.markdown.references[ref_id] = (link, title[1:-1])
            else:
                new_text.append(line)

        return new_text