"""
CORE MARKDOWN BLOCKPARSER
=============================================================================

This parser handles basic parsing of Markdown blocks.  It doesn't concern itself
with inline elements such as **bold** or *italics*, but rather just catches
blocks, lists, quotes, etc.

The BlockParser is made up of a bunch of BlockProcessors, each handling a
different type of block. Extensions may add/replace/remove BlockProcessors
as they need to alter how markdown blocks are parsed.

"""
14
15
import logging
16
import re
17
import util
18
from blockparser import BlockParser
19
20
logger =  logging.getLogger('MARKDOWN')
21
22
23
def build_block_parser(md_instance, **kwargs):
    """ Build the default block parser used by Markdown.

    Creates a ``BlockParser`` for ``md_instance`` and registers one instance
    of each built-in block processor under the keys assigned below.

    Keywords:

    * ``md_instance``: The object handed to the ``BlockParser`` constructor.
    * ``kwargs``: Accepted for call compatibility; not used by this function.

    Returns the populated ``BlockParser``.

    """
    parser = BlockParser(md_instance)
    parser.blockprocessors['empty'] = EmptyBlockProcessor(parser)
    parser.blockprocessors['indent'] = ListIndentProcessor(parser)
    parser.blockprocessors['code'] = CodeBlockProcessor(parser)
    parser.blockprocessors['hashheader'] = HashHeaderProcessor(parser)
    parser.blockprocessors['setextheader'] = SetextHeaderProcessor(parser)
    parser.blockprocessors['hr'] = HRProcessor(parser)
    parser.blockprocessors['olist'] = OListProcessor(parser)
    parser.blockprocessors['ulist'] = UListProcessor(parser)
    parser.blockprocessors['quote'] = BlockQuoteProcessor(parser)
    parser.blockprocessors['paragraph'] = ParagraphProcessor(parser)
    return parser
37
38
39
class BlockProcessor:
    """ Base class for block processors.

    Each subclass will provide the methods below to work with the source and
    tree. Each processor will need to define its own ``test`` and ``run``
    methods. The ``test`` method should return True or False, to indicate
    whether the current block should be processed by this processor. If the
    test passes, the parser will call the processor's ``run`` method.

    """

    def __init__(self, parser):
        # Keep a handle on the owning parser and cache the tab width that is
        # configured on its ``markdown`` attribute.
        self.parser = parser
        self.tab_length = parser.markdown.tab_length

    def lastChild(self, parent):
        """ Return the last child of an etree element, or None if empty. """
        if len(parent):
            return parent[-1]
        else:
            return None

    def detab(self, text):
        """ Remove a tab from the front of each line of the given text.

        Lines are consumed from the top: a line indented by ``tab_length``
        spaces loses that indent, a blank line becomes an empty string, and
        the first line that is neither stops processing.

        Returns a ``(detabbed, rest)`` tuple of strings, where ``rest`` holds
        the unprocessed lines (empty if every line was consumed).

        """
        newtext = []
        lines = text.split('\n')
        for line in lines:
            if line.startswith(' '*self.tab_length):
                newtext.append(line[self.tab_length:])
            elif not line.strip():
                newtext.append('')
            else:
                break
        return '\n'.join(newtext), '\n'.join(lines[len(newtext):])

    def looseDetab(self, text, level=1):
        """ Remove a tab from front of lines but allowing dedented lines.

        Every line that starts with ``level`` tabs worth of spaces has that
        indent removed; all other lines are kept unchanged.

        """
        width = self.tab_length*level
        indent = ' '*width
        lines = text.split('\n')
        for i, line in enumerate(lines):
            if line.startswith(indent):
                lines[i] = line[width:]
        return '\n'.join(lines)

    def test(self, parent, block):
        """ Test for block type. Must be overridden by subclasses.

        As the parser loops through processors, it will call the ``test`` method
        on each to determine if the given block of text is of that type. This
        method must return a boolean ``True`` or ``False``. The actual method of
        testing is left to the needs of that particular block type. It could
        be as simple as ``block.startswith(some_string)`` or a complex regular
        expression. As the block type may be different depending on the parent
        of the block (i.e. inside a list), the parent etree element is also
        provided and may be used as part of the test.

        Keywords:

        * ``parent``: A etree element which will be the parent of the block.
        * ``block``: A block of text from the source which has been split at
            blank lines.
        """
        pass

    def run(self, parent, blocks):
        """ Run processor. Must be overridden by subclasses.

        When the parser determines the appropriate type of a block, the parser
        will call the corresponding processor's ``run`` method. This method
        should parse the individual lines of the block and append them to
        the etree.

        Note that both the ``parent`` and ``blocks`` keywords are pointers
        to instances of the objects which should be edited in place. Each
        processor must make changes to the existing objects as there is no
        mechanism to return new/different objects to replace them.

        This means that this method should be adding SubElements or adding text
        to the parent, and should remove (``pop``) or add (``insert``) items to
        the list of blocks.

        Keywords:

        * ``parent``: A etree element which is the parent of the current block.
        * ``blocks``: A list of all remaining blocks of the document.
        """
        pass
125
126
127
class ListIndentProcessor(BlockProcessor):
    """ Process children of list items.

    Example:
        * a list item
            process this part

            or this part

    """

    ITEM_TYPES = ['li']
    LIST_TYPES = ['ul', 'ol']

    def __init__(self, *args):
        BlockProcessor.__init__(self, *args)
        # Captures one or more whole tab-widths of leading spaces.
        self.INDENT_RE = re.compile(r'^(([ ]{%s})+)'% self.tab_length)

    def test(self, parent, block):
        """ Match an indented block whose parent is, or ends with, a list.

        The block must start with one tab of spaces, the parser must not
        already be in the ``detabbed`` state, and ``parent`` must either be
        a list item or have a list as its last child.

        """
        # NOTE(review): ``len(parent[-1])`` spells out the truth test this
        # code has always applied to the last child (an etree element with no
        # children is falsy), so an empty trailing list does not match.
        return block.startswith(' '*self.tab_length) and \
                not self.parser.state.isstate('detabbed') and  \
                (parent.tag in self.ITEM_TYPES or \
                    (len(parent) and len(parent[-1]) and \
                        (parent[-1].tag in self.LIST_TYPES)
                    )
                )

    def run(self, parent, blocks):
        """ Detab the first block and parse it under the right list element. """
        block = blocks.pop(0)
        level, sibling = self.get_level(parent, block)
        block = self.looseDetab(block, level)

        self.parser.state.set('detabbed')
        try:
            if parent.tag in self.ITEM_TYPES:
                # It's possible that this parent has a 'ul' or 'ol' child list
                # with a member.  If that is the case, then that should be the
                # parent.  This is intended to catch the edge case of an
                # indented list whose first member was parsed previous to this
                # point; see OListProcessor.
                if len(parent) and parent[-1].tag in self.LIST_TYPES:
                    self.parser.parseBlocks(parent[-1], [block])
                else:
                    # The parent is already a li. Just parse the child block.
                    self.parser.parseBlocks(parent, [block])
            elif sibling.tag in self.ITEM_TYPES:
                # The sibling is a li. Use it as parent.
                self.parser.parseBlocks(sibling, [block])
            elif len(sibling) and sibling[-1].tag in self.ITEM_TYPES:
                # The parent is a list (``ol`` or ``ul``) which has children.
                # Assume the last child li is the parent of this block.
                if sibling[-1].text:
                    # If the parent li has text, that text needs to be moved
                    # to a p. The p must be 'inserted' at beginning of list in
                    # the event that other children already exist i.e.; a
                    # nested sublist.
                    p = util.etree.Element('p')
                    p.text = sibling[-1].text
                    sibling[-1].text = ''
                    sibling[-1].insert(0, p)
                self.parser.parseChunk(sibling[-1], block)
            else:
                self.create_item(sibling, block)
        finally:
            # Undo the 'detabbed' state even if parsing the block raised, so
            # the parser's state is not left behind for later use.
            self.parser.state.reset()

    def create_item(self, parent, block):
        """ Create a new li and parse the block with it as the parent. """
        li = util.etree.SubElement(parent, 'li')
        self.parser.parseBlocks(li, [block])

    def get_level(self, parent, block):
        """ Get level of indent based on list level.

        Returns a ``(level, parent)`` tuple: the list nesting level reached
        and the element found at that level.

        """
        # Get indent level
        m = self.INDENT_RE.match(block)
        if m:
            # Floor division keeps this an integer on Python 2 and 3 alike.
            indent_level = len(m.group(1)) // self.tab_length
        else:
            indent_level = 0
        if self.parser.state.isstate('list'):
            # We're in a tightlist - so we already are at correct parent.
            level = 1
        else:
            # We're in a looselist - so we need to find parent.
            level = 0
        # Step through children of tree to find matching indent level.
        while indent_level > level:
            child = self.lastChild(parent)
            # NOTE(review): ``len(child)`` preserves the original truth test
            # on the element, so a childless list/li stops the descent here.
            if child is not None and len(child) and \
                    (child.tag in self.LIST_TYPES or child.tag in self.ITEM_TYPES):
                if child.tag in self.LIST_TYPES:
                    level += 1
                parent = child
            else:
                # No more child levels. If we're short of indent_level,
                # we have a code block. So we stop here.
                break
        return level, parent
221
222
223
class CodeBlockProcessor(BlockProcessor):
    """ Process code blocks. """

    def test(self, parent, block):
        """ Match any block that starts with one tab worth of spaces. """
        return block.startswith(' '*self.tab_length)

    def run(self, parent, blocks):
        """ Add the first block to a new or existing ``pre``/``code`` pair. """
        sibling = self.lastChild(parent)
        block = blocks.pop(0)
        # Both branches below need the detabbed text, so split it off once.
        block, theRest = self.detab(block)
        if sibling is not None and sibling.tag == "pre" and len(sibling) \
                    and sibling[0].tag == "code":
            # The previous block was a code block. As blank lines do not start
            # new code blocks, append this block to the previous, adding back
            # linebreaks removed from the split into a list.
            code = sibling[0]
            code.text = util.AtomicString('%s\n%s\n' % (code.text, block.rstrip()))
        else:
            # This is a new codeblock. Create the elements and insert text.
            pre = util.etree.SubElement(parent, 'pre')
            code = util.etree.SubElement(pre, 'code')
            code.text = util.AtomicString('%s\n' % block.rstrip())
        if theRest:
            # This block contained unindented line(s) after the first indented
            # line. Insert these lines as the first block of the master blocks
            # list for future processing.
            blocks.insert(0, theRest)
252
253
254
class BlockQuoteProcessor(BlockProcessor):
    """ Process blockquotes. """

    # Detect a ``>`` marker at the start of any line in the block.
    # ``group(2)`` contains the text after the marker.
    RE = re.compile(r'(^|\n)[ ]{0,3}>[ ]?(.*)')

    def test(self, parent, block):
        """ Match any block that contains a blockquote line. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Parse the first block into a new or existing ``blockquote``. """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()] # Lines before blockquote
            # Pass lines before blockquote in recursively for parsing first.
            self.parser.parseBlocks(parent, [before])
            # Remove ``> `` from beginning of each line.
            block = '\n'.join([self.clean(line) for line in
                            block[m.start():].split('\n')])
        sibling = self.lastChild(parent)
        # Compare against None explicitly: an etree element with no children
        # is falsy, so a plain truth test would miss an empty blockquote and
        # start a second one instead of continuing it.
        if sibling is not None and sibling.tag == "blockquote":
            # Previous block was a blockquote so set that as this blocks parent
            quote = sibling
        else:
            # This is a new blockquote. Create a new parent element.
            quote = util.etree.SubElement(parent, 'blockquote')
        # Recursively parse block with blockquote as parent.
        # change parser state so blockquotes embedded in lists use p tags
        self.parser.state.set('blockquote')
        try:
            self.parser.parseChunk(quote, block)
        finally:
            # Undo the state change even if parsing the quote raised.
            self.parser.state.reset()

    def clean(self, line):
        """ Remove ``>`` from beginning of a line. """
        m = self.RE.match(line)
        if line.strip() == ">":
            return ""
        elif m:
            return m.group(2)
        else:
            return line
293
294
class OListProcessor(BlockProcessor):
    """ Process ordered list blocks. """

    TAG = 'ol'
    # Detect an item (``1. item``). ``group(1)`` contains contents of item.
    RE = re.compile(r'^[ ]{0,3}\d+\.[ ]+(.*)')
    # Detect items on secondary lines. they can be of either list type.
    CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ]+(.*)')
    # Detect indented (nested) items of either type
    INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ]+.*')
    # Extract the leading integer of an item marker such as ``3.``.
    INTEGER_RE = re.compile(r'(\d+)')
    # The integer (python string) with which the list starts (default=1)
    # Eg: If list is initialized as
    #   3. Item
    # The ol tag will get start="3" attribute
    STARTSWITH = '1'

    def test(self, parent, block):
        """ Match a block whose first line is an item of this list type. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Parse the first block into list items under the right list. """
        # Check for multiple items in one block.
        items = self.get_items(blocks.pop(0))
        sibling = self.lastChild(parent)

        # ``len(sibling)`` keeps the original requirement that the previous
        # list already has a child, which makes ``lst[-1]`` below safe.
        if sibling is not None and len(sibling) and sibling.tag in ['ol', 'ul']:
            # Previous block was a list item, so set that as parent
            lst = sibling
            # make sure previous item is in a p- if the item has text, then it
            # isn't in a p
            if lst[-1].text:
                # since it's possible there are other children for this sibling,
                # we can't just SubElement the p, we need to insert it as the
                # first item
                p = util.etree.Element('p')
                p.text = lst[-1].text
                lst[-1].text = ''
                lst[-1].insert(0, p)

            # parse first block differently as it gets wrapped in a p.
            li = util.etree.SubElement(lst, 'li')
            firstitem = items.pop(0)
            self.parser.state.set('looselist')
            try:
                self.parser.parseBlocks(li, [firstitem])
            finally:
                # Undo the state change even if parsing the item raised.
                self.parser.state.reset()
        elif parent.tag in ['ol', 'ul']:
            # this catches the edge case of a multi-item indented list whose
            # first item is in a blank parent-list item:
            # * * subitem1
            #     * subitem2
            # see also ListIndentProcessor
            lst = parent
        else:
            # This is a new list so create parent with appropriate tag.
            lst = util.etree.SubElement(parent, self.TAG)
            # Check if a custom start integer is set
            if not self.parser.markdown.lazy_ol and self.STARTSWITH !='1':
                lst.attrib['start'] = self.STARTSWITH

        self.parser.state.set('list')
        try:
            # Loop through items in block, recursively parsing each with the
            # appropriate parent.
            for item in items:
                if item.startswith(' '*self.tab_length):
                    # Item is indented. Parse with last item as parent
                    self.parser.parseBlocks(lst[-1], [item])
                else:
                    # New item. Create li and parse with it as parent
                    li = util.etree.SubElement(lst, 'li')
                    self.parser.parseBlocks(li, [item])
        finally:
            # Undo the state change even if parsing an item raised.
            self.parser.state.reset()

    def get_items(self, block):
        """ Break a block into list items.

        Returns a list of strings, one per item. As a side effect, for an
        ``ol`` processor the number on the first item is stored in
        ``self.STARTSWITH``.

        """
        items = []
        for line in block.split('\n'):
            m = self.CHILD_RE.match(line)
            if m:
                # This is a new list item
                # Check first item for the start index
                if not items and self.TAG=='ol':
                    # Detect the integer value of first list item
                    self.STARTSWITH = self.INTEGER_RE.match(m.group(1)).group()
                # Append to the list
                items.append(m.group(3))
            elif self.INDENT_RE.match(line):
                # This is an indented (possibly nested) item.
                if items[-1].startswith(' '*self.tab_length):
                    # Previous item was indented. Append to that item.
                    items[-1] = '%s\n%s' % (items[-1], line)
                else:
                    items.append(line)
            else:
                # This is another line of previous item. Append to that item.
                items[-1] = '%s\n%s' % (items[-1], line)
        return items
390
391
392
class UListProcessor(OListProcessor):
    """ Process unordered list blocks. """

    TAG = 'ul'
    # Detect an item (``* item``, ``+ item`` or ``- item``).
    # ``group(1)`` contains contents of item.
    RE = re.compile(r'^[ ]{0,3}[*+-][ ]+(.*)')
397
398
399
class HashHeaderProcessor(BlockProcessor):
    """ Process Hash Headers. """

    # Detect a header at start of any line in block
    RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')

    def test(self, parent, block):
        """ Match any block that contains a hash header line. """
        return bool(self.RE.search(block))

    def run(self, parent, blocks):
        """ Turn the first header line of the first block into an ``hN``. """
        block = blocks.pop(0)
        m = self.RE.search(block)
        if m:
            before = block[:m.start()] # All lines before header
            after = block[m.end():]    # All lines after header
            if before:
                # As the header was not the first line of the block and the
                # lines before the header must be parsed first,
                # recursively parse these lines as a block.
                self.parser.parseBlocks(parent, [before])
            # Create header using named groups from RE
            h = util.etree.SubElement(parent, 'h%d' % len(m.group('level')))
            h.text = m.group('header').strip()
            if after:
                # Insert remaining lines as first block for future parsing.
                blocks.insert(0, after)
        else:
            # This should never happen, but just in case...
            # ``warning`` replaces the deprecated ``warn`` alias.
            logger.warning("We've got a problem header: %r", block)
428
429
430
class SetextHeaderProcessor(BlockProcessor):
    """ Process Setext-style Headers. """

    # Detect Setext-style header. Must be first 2 lines of block.
    RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE)

    def test(self, parent, block):
        """ Match a block whose second line is a run of ``=`` or ``-``. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Turn the first two lines of the first block into a header. """
        lines = blocks.pop(0).split('\n')
        # Determine level. ``=`` is 1 and ``-`` is 2.
        if lines[1].startswith('='):
            level = 1
        else:
            level = 2
        h = util.etree.SubElement(parent, 'h%d' % level)
        h.text = lines[0].strip()
        if len(lines) > 2:
            # Block contains additional lines. Add to master blocks for later.
            blocks.insert(0, '\n'.join(lines[2:]))
451
452
453
class HRProcessor(BlockProcessor):
    """ Process Horizontal Rules. """

    RE = r'[ ]{0,3}(?P<ch>[*_-])[ ]?((?P=ch)[ ]?){2,}[ ]*'
    # Detect hr on any line of a block.
    SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE)
    # Match a hr on a single line of text.
    MATCH_RE = re.compile(r'^%s$' % RE)

    def test(self, parent, block):
        """ Match any block that contains a horizontal rule line. """
        return bool(self.SEARCH_RE.search(block))

    def run(self, parent, blocks):
        """ Split the first block around its first rule and add an ``hr``. """
        lines = blocks.pop(0).split('\n')
        prelines = []
        # Check for lines in block before hr.
        for line in lines:
            m = self.MATCH_RE.match(line)
            if m:
                break
            else:
                prelines.append(line)
        if len(prelines):
            # Recursively parse lines before hr so they get parsed first.
            self.parser.parseBlocks(parent, ['\n'.join(prelines)])
        # create hr
        util.etree.SubElement(parent, 'hr')
        # check for lines in block after hr.
        lines = lines[len(prelines)+1:]
        if len(lines):
            # Add lines after hr to master blocks for later parsing.
            blocks.insert(0, '\n'.join(lines))
485
486
487
class EmptyBlockProcessor(BlockProcessor):
    """ Process blocks that start with an empty line. """

    # Detect a block that only contains whitespace
    # or only whitespace on the first line.
    RE = re.compile(r'^\s*\n')

    def test(self, parent, block):
        """ Match a block that begins with whitespace ending in a newline. """
        return bool(self.RE.match(block))

    def run(self, parent, blocks):
        """ Strip the leading blank part and requeue the rest of the block. """
        block = blocks.pop(0)
        m = self.RE.match(block)
        if m:
            # Add remaining line to master blocks for later.
            blocks.insert(0, block[m.end():])
            sibling = self.lastChild(parent)
            # NOTE(review): the ``len(...)`` checks spell out the truth tests
            # this guard has always applied to the elements (an etree element
            # with no children is falsy). As written the branch only runs
            # when the ``code`` element itself has child elements - confirm
            # whether ``sibling[0] is not None`` was intended.
            if sibling is not None and len(sibling) and \
                    sibling.tag == 'pre' and len(sibling[0]) and \
                    sibling[0].tag == 'code':
                # Last block is a codeblock. Append to preserve whitespace.
                # (The original literal was '/n/n/n', a typo for newlines.)
                sibling[0].text = util.AtomicString('%s\n\n\n' % sibling[0].text )
508
509
510
class ParagraphProcessor(BlockProcessor):
    """ Process Paragraph blocks. """

    def test(self, parent, block):
        """ Accept every block. """
        return True

    def run(self, parent, blocks):
        """ Add the first block as text or a ``p``; drop it if blank. """
        block = blocks.pop(0)
        if block.strip():
            # Not a blank block. Add to parent, otherwise throw it away.
            if self.parser.state.isstate('list'):
                # The parent is a tight-list. Append to parent.text
                if parent.text:
                    parent.text = '%s\n%s' % (parent.text, block)
                else:
                    parent.text = block.lstrip()
            else:
                # Create a regular paragraph
                p = util.etree.SubElement(parent, 'p')
                p.text = block.lstrip()