"""
PRE-PROCESSORS
=============================================================================

Preprocessors work on source text before we start doing anything too
complicated.
"""
| 8 |
|
| 9 |
import re |
| 10 |
import util |
| 11 |
import odict |
| 12 |
|
| 13 |
|
| 14 |
def build_preprocessors(md_instance, **kwargs):
    """
    Return the default, ordered collection of preprocessors for Markdown.

    Each preprocessor is constructed with `md_instance` and registered under
    its name, in the order it should run: "html_block" first, then
    "reference". Extra keyword arguments are accepted but not used here.
    """
    defaults = (
        ("html_block", HtmlBlockPreprocessor),
        ("reference", ReferencePreprocessor),
    )
    registry = odict.OrderedDict()
    for name, processor_class in defaults:
        registry[name] = processor_class(md_instance)
    return registry
| 20 |
|
| 21 |
|
| 22 |
class Preprocessor(util.Processor):
    """
    Base class for preprocessors, which run once the text has been broken
    into lines.

    A preprocessor provides a "run" method. It is handed the list of lines
    that make up the document, changes it as required, and hands back either
    that same list or a newly built one.

    Preprocessors must extend markdown.Preprocessor.

    """

    def run(self, lines):
        """
        Hook to be overridden by every subclass of Preprocessor.

        `lines` is the document as a list of strings split on newlines; an
        override returns the (possibly modified) list of lines. This base
        implementation does nothing and returns None.

        """
        return None
| 41 |
|
| 42 |
|
| 43 |
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates (filled with the opening tag name) tried, in order, when
    # looking for the text that closes a block.
    right_tag_patterns = ["</%s>", "%s>"]
    # Verbose-mode regex matching one attribute inside an opening tag.
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, blocks whose opening tag carries a `markdown` attribute keep
    # their inner text un-stashed; only the opening/closing tags are stashed.
    markdown_in_raw = False
    # Matches the `markdown` attribute (with optional value) so it can be
    # removed from an opening tag before that tag is stashed.
    _MARKDOWN_ATTR_RE = re.compile(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?')

    def _get_left_tag(self, block):
        """
        Parse the opening tag at the start of `block`.

        Returns a tuple `(tag, length, attrs)`: the tag name, the length of
        the opening tag text, and a dict mapping attribute names to values
        ("" for attributes without a value).
        """
        m = self.left_tag_re.match(block)
        if m:
            tag = m.group('tag')
            raw_attrs = m.group('attrs')
            attrs = {}
            if raw_attrs:
                for ma in self.attrs_re.finditer(raw_attrs):
                    if ma.group('attr'):
                        attrs[ma.group('attr').strip()] = ma.group('value') or ""
                    elif ma.group('attr1'):
                        attrs[ma.group('attr1').strip()] = ma.group('value1') or ""
                    elif ma.group('attr2'):
                        attrs[ma.group('attr2').strip()] = ""
            return tag, len(m.group(0)), attrs

        # The regex cannot match when "<" is followed directly by ">" or a
        # space. Fall back to the first whitespace-delimited word; blocks
        # such as "<>" or "< >" leave no word at all, so use an empty tag
        # name instead of raising IndexError.
        words = block[1:].replace(">", " ", 1).split()
        tag = words[0].lower() if words else ""
        return tag, len(tag) + 2, {}

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Find the `rtag` that balances an already-open `ltag`.

        Searches `block` from `start_index`, recursing past any nested
        `ltag` ... `rtag` pairs. Returns the index just past the matching
        `rtag`, or -1 when no match exists.
        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if j > i or j == -1:
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """
        Locate the closing tag for `left_tag` within `block`.

        Returns `(right_tag, data_index)`. When one of `right_tag_patterns`
        is found, `right_tag` is that text without its leading "<" and
        trailing ">" characters and `data_index` is the index just past it.
        Otherwise `right_tag` is taken from the tail of the block and
        `data_index` is `len(block)`.
        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Return True when `right_tag` is an acceptable close for `left_tag`.

        Slices are used instead of indexing so that empty tag names are
        compared safely rather than raising IndexError.
        """
        if left_tag[:1] in ['?', '@', '%']:  # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if right_tag == "--" and left_tag == "--":
            return True
        return left_tag == right_tag[1:] and right_tag[:1] != "<"

    def _is_oneliner(self, tag):
        """Return True for tags that are complete on their own ('hr')."""
        return tag in ['hr', 'hr/']

    def _store_items(self, items, left_index, right_tag, attrs, new_blocks):
        """
        Stash a run of collected raw-HTML blocks and append to `new_blocks`.

        With `markdown_in_raw` enabled and a `markdown` attribute present,
        only the opening tag (minus that attribute) and the closing tag are
        stashed and the text in between is appended as-is. Otherwise all
        items are joined and stashed as one piece. `items` is modified in
        place.
        """
        if self.markdown_in_raw and 'markdown' in attrs:
            tail = len(right_tag) + 2
            start = self._MARKDOWN_ATTR_RE.sub('', items[0][:left_index])
            items[0] = items[0][left_index:]
            end = items[-1][-tail:]
            items[-1] = items[-1][:-tail]
            new_blocks.append(self.markdown.htmlStash.store(start))
            new_blocks.extend(items)
            new_blocks.append(self.markdown.htmlStash.store(end))
        else:
            new_blocks.append(
                self.markdown.htmlStash.store('\n\n'.join(items)))

    def run(self, lines):
        """
        Replace raw HTML blocks in `lines` with stash placeholders.

        The text is processed in blocks separated by blank lines. Blocks
        that open with a block-level tag (per `util.isBlockLevel`), a
        comment or a processing instruction are handed to
        `self.markdown.htmlStash.store` and the value it returns is put in
        their place; a tag left open is accumulated across following blocks
        until its closing tag is seen. Returns the resulting list of lines.
        """
        blocks = "\n".join(lines).split("\n\n")
        new_blocks = []
        items = []
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False  # True while collecting the blocks of an open tag

        while blocks:
            block = blocks.pop(0)
            # Blocks never contain "\n\n", so at most one leading newline
            # can be present.
            if block.startswith("\n"):
                block = block[1:]

            if in_tag:
                items.append(block)
                right_tag, data_index = self._get_right_tag(left_tag,
                                                            left_index,
                                                            block)
                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag
                    in_tag = False
                    self._store_items(items, left_index, right_tag, attrs,
                                      new_blocks)
                    items = []
                continue

            if not (block.startswith("<") and len(block.strip()) > 1):
                new_blocks.append(block)
                continue

            left_tag, left_index, attrs = self._get_left_tag(block)
            right_tag, data_index = self._get_right_tag(left_tag,
                                                        left_index,
                                                        block)

            if block[1] == "!":
                # is a comment block
                left_tag = "--"
                right_tag, data_index = self._get_right_tag(left_tag,
                                                            left_index,
                                                            block)
                # keep checking conditions below and maybe just append

            if data_index < len(block) and util.isBlockLevel(left_tag):
                # Text follows the closing tag: push it back so that it is
                # handled as a block of its own.
                blocks.insert(0, block[data_index:])
                block = block[:data_index]

            if not (util.isBlockLevel(left_tag)
                    or block[1] in ["!", "?", "@", "%"]):
                new_blocks.append(block)
                continue

            if self._is_oneliner(left_tag):
                new_blocks.append(block.strip())
                continue

            if block.rstrip().endswith(">") \
                    and self._equal_tags(left_tag, right_tag):
                if self.markdown_in_raw and 'markdown' in attrs:
                    tail = len(right_tag) + 2
                    start = self._MARKDOWN_ATTR_RE.sub('', block[:left_index])
                    end = block[-tail:]
                    block = block[left_index:-tail]
                    new_blocks.append(self.markdown.htmlStash.store(start))
                    new_blocks.append(block)
                    new_blocks.append(self.markdown.htmlStash.store(end))
                else:
                    new_blocks.append(
                        self.markdown.htmlStash.store(block.strip()))
            elif util.isBlockLevel(left_tag) \
                    or (left_tag == "--"
                        and not block.rstrip().endswith(">")):
                # Block-level tag (or unterminated comment) that is not
                # complete yet: start collecting. The parentheses spell out
                # how `or`/`and` have always been evaluated here.
                items.append(block.strip())
                in_tag = True
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store(block.strip()))

        if items:
            # Input ended while a tag was still open: stash what was
            # collected so far.
            self._store_items(items, left_index, right_tag, attrs, new_blocks)
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
| 250 |
|
| 251 |
|
| 252 |
class ReferencePreprocessor(Preprocessor):
    """ Pull reference definitions out of the text, keeping them for later. """

    RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)

    def run(self, lines):
        """
        Record every reference definition found in `lines`.

        A line matching `RE` with no title, or with a title wrapped in
        double quotes, single quotes or parentheses, is stored in
        `self.markdown.references` under its lower-cased id as a
        `(link, title)` tuple and dropped from the output. All other lines
        are returned unchanged, in order.
        """
        remaining = []
        for line in lines:
            match = self.RE.match(line)
            if match is None:
                remaining.append(line)
                continue

            ref_id = match.group(2).strip().lower()
            url = match.group(3).lstrip('<').rstrip('>')
            title = match.group(4).strip()  # potential title

            if not title:
                self.markdown.references[ref_id] = (url, title)
            elif (len(title) >= 2
                    and title[0] + title[-1] in ('""', "''", '()')):
                self.markdown.references[ref_id] = (url, title[1:-1])
            else:
                # Trailing text that is not a properly delimited title:
                # leave the line in the document.
                remaining.append(line)

        return remaining