1
"""
2
INLINE PATTERNS
3
=============================================================================
4
5
Inline patterns such as *emphasis* are handled by means of auxiliary
6
objects, one per pattern.  Pattern objects must be instances of classes
7
that extend markdown.Pattern.  Each pattern object uses a single regular
8
expression and needs to support the following methods:
9
10
    pattern.getCompiledRegExp() # returns a regular expression
11
12
    pattern.handleMatch(m) # takes a match object and returns
13
                           # an ElementTree element or just plain text
14
15
All of python markdown's built-in patterns subclass from Pattern,
16
but you can add additional patterns that don't.
17
18
Also note that all the regular expressions used by inline must
19
capture the whole block.  For this reason, they all start with
20
'^(.*?)' and end with '(.*?)$'.  For the built-in expressions,
21
Pattern takes care of adding the "^(.*?)" and "(.*?)$".
22
23
Finally, the order in which regular expressions are applied is very
24
important - e.g. if we first replace http://.../ links with <a> tags
25
and _then_ try to replace inline html, we would end up with a mess.
26
So, we apply the expressions in the following order:
27
28
* escape and backticks have to go before everything else, so
29
  that we can preempt any markdown patterns by escaping them.
30
31
* then we handle auto-links (must be done before inline html)
32
33
* then we handle inline HTML.  At this point we will simply
34
  replace all inline HTML strings with a placeholder and add
35
  the actual HTML to a hash.
36
37
* then inline images (must be done before links)
38
39
* then bracketed links, first regular then reference-style
40
41
* finally we apply strong and emphasis
42
"""
43
44
import util
45
import odict
46
import re
47
from urlparse import urlparse, urlunparse
48
import sys
49
# If you see an ImportError for htmlentitydefs after using 2to3 to convert for 
50
# use by Python3, then you are probably using the buggy version from Python 3.0.
51
# We recommend using the tool from Python 3.1 even if you will be running the
52
# code on Python 3.0.  The following line should be converted by the tool to:
53
# `from html import entities` and later calls to `htmlentitydefs` should be
54
# changed to call `entities`. Python 3.1's tool does this but 3.0's does not.
55
import htmlentitydefs
56
57
58
def build_inlinepatterns(md_instance, **kwargs):
    """
    Assemble the default inline patterns for Markdown.

    Keyword arguments:

    * md_instance: The Markdown instance handed to the patterns that need
      one.  Its `smart_emphasis` attribute selects which regular expression
      is registered under the "emphasis2" key.
    * kwargs: Accepted but not used by this function.

    Returns an `odict.OrderedDict` mapping pattern names to pattern objects,
    filled in the order of the table below.

    """
    defaults = [
        ("backtick", BacktickPattern(BACKTICK_RE)),
        ("escape", SimpleTextPattern(ESCAPE_RE)),
        ("reference", ReferencePattern(REFERENCE_RE, md_instance)),
        ("link", LinkPattern(LINK_RE, md_instance)),
        ("image_link", ImagePattern(IMAGE_LINK_RE, md_instance)),
        ("image_reference",
         ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)),
        ("short_reference", ReferencePattern(SHORT_REF_RE, md_instance)),
        ("autolink", AutolinkPattern(AUTOLINK_RE, md_instance)),
        ("automail", AutomailPattern(AUTOMAIL_RE, md_instance)),
        ("linebreak2", SubstituteTagPattern(LINE_BREAK_2_RE, 'br')),
        ("linebreak", SubstituteTagPattern(LINE_BREAK_RE, 'br')),
        ("html", HtmlPattern(HTML_RE, md_instance)),
        ("entity", HtmlPattern(ENTITY_RE, md_instance)),
        ("not_strong", SimpleTextPattern(NOT_STRONG_RE)),
        ("strong_em", DoubleTagPattern(STRONG_EM_RE, 'strong,em')),
        ("strong", SimpleTagPattern(STRONG_RE, 'strong')),
        ("emphasis", SimpleTagPattern(EMPHASIS_RE, 'em')),
    ]

    # Underscore emphasis comes last; only its regular expression differs
    # between the "smart" and the plain variant.
    if md_instance.smart_emphasis:
        defaults.append(
            ("emphasis2", SimpleTagPattern(SMART_EMPHASIS_RE, 'em')))
    else:
        defaults.append(
            ("emphasis2", SimpleTagPattern(EMPHASIS_2_RE, 'em')))

    inlinePatterns = odict.OrderedDict()
    for name, pattern in defaults:
        inlinePatterns[name] = pattern
    return inlinePatterns
85
86
"""
87
The actual regular expressions for patterns
88
-----------------------------------------------------------------------------
89
"""
90
91
# NOTE: every expression below is meant to be wrapped by Pattern, which
# prepends "^(.*?)" and appends "(.*?)$".  The pattern's own groups are
# therefore numbered from 2, and the back-references (\2, \12) count the
# wrapper's leading group as well.

# Any run of characters that are not square brackets.
NOBRACKET = r'[^\]\[]*'
# Bracketed text whose content may itself contain brackets nested up to six
# levels deep.  Contributes seven capturing groups (the content plus one per
# nesting level), i.e. groups 2-8 once wrapped, so the first group following
# BRK is group 9.
BRK = (r'\[('
       + (NOBRACKET + r'(\[') * 6
       + (NOBRACKET + r'\])*') * 6
       + NOBRACKET + r')\]')
# Negative lookbehind: not preceded by "!" (which would make it an image).
NOIMG = r'(?<!\!)'

BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'  # `e=f()` or ``e=f("`")``
ESCAPE_RE = r'\\(.)'                            # \<
EMPHASIS_RE = r'(\*)([^\*]+)\2'                 # *emphasis*
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'              # **strong**
STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'           # ***strong***
SMART_EMPHASIS_RE = r'(?<!\w)(_)(\S.+?)\2(?!\w)'  # _smart_emphasis_
EMPHASIS_2_RE = r'(_)(.+?)\2'                   # _emphasis_
# [text](url) or [text](<url>) or [text](url "title")
LINK_RE = (NOIMG + BRK +
           r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)''')

# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
REFERENCE_RE = NOIMG + BRK + r'\s*\[([^\]]*)\]'  # [Google][3]
SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'           # [Google]
# The tail is a raw string so that "\s", "\[" and "\]" are not treated as
# (invalid) string escape sequences; the resulting value is unchanged.
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s*\[([^\]]*)\]'  # ![alt text][2]
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'            # stand-alone * or _
AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>'        # <http://www.123.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'             # <me@example.com>

HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'  # <...>
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'                # &amp;
LINE_BREAK_RE = r'  \n'                          # two spaces at end of line
LINE_BREAK_2_RE = r'  $'                         # two spaces at end of text
122
123
124
def dequote(string):
    """
    Remove one pair of matching quotes from around a string.

    Keyword arguments:

    * string: The text to inspect.

    Returns the text without its first and last character when both are the
    same quote character (`"` or `'`); otherwise the text is returned
    unchanged.  A string shorter than two characters is never altered, so a
    lone quote character is not mistaken for an enclosing pair.

    """
    if (len(string) >= 2
            and string[0] == string[-1]
            and string[0] in ('"', "'")):
        return string[1:-1]
    return string
131
132
# Raw string so that "\{" and "\}" are not treated as (invalid) string
# escape sequences; the compiled pattern is unchanged.
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}") # {@id=123}

def handleAttributes(text, parent):
    """
    Set values of an element based on attribute definitions ({@id=123}).

    Keyword arguments:

    * text: The text to scan for `{@name=value}` definitions.
    * parent: The element that receives the attributes through its
      `set(name, value)` method.

    Newlines inside a value are replaced by spaces before it is set.
    Returns `text` with every attribute definition removed.

    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # Say explicitly that the definition is replaced by nothing rather
        # than handing an implicit None to re.sub.
        return ''
    return ATTR_RE.sub(attributeCallback, text)
139
140
141
"""
142
The pattern classes
143
-----------------------------------------------------------------------------
144
"""
145
146
class Pattern:
    """Common base for the inline pattern classes. """

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern.  It is
          compiled between a leading "^(.*?)" and a trailing "(.*?)$", so
          the pattern's own groups are numbered from 2.
        * markdown_instance: Optional Markdown instance; it is stored as
          `self.markdown` only when a truthy value is given.

        """
        flags = re.DOTALL | re.UNICODE
        self.pattern = pattern
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, flags)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Return the compiled (wrapped) regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        This base implementation returns None; subclasses should override
        this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        return None

    def type(self):
        """ Return the name of the instance's class as the pattern type. """
        cls = self.__class__
        return cls.__name__

BasePattern = Pattern # for backward compatibility
188
189
class SimpleTextPattern(Pattern):
    """ Hand back the plain text captured by group(2) of a Pattern. """
    def handleMatch(self, m):
        captured = m.group(2)
        if captured != util.INLINE_PLACEHOLDER_PREFIX:
            return captured
        # A capture equal to the inline placeholder prefix yields None
        # instead of text.
        return None
196
197
class SimpleTagPattern(Pattern):
    """
    Build an element of type `tag` whose text is group(3) of a Pattern.

    """
    def __init__ (self, pattern, tag):
        # No Markdown instance is passed on to the base class.
        self.tag = tag
        Pattern.__init__(self, pattern)

    def handleMatch(self, m):
        element = util.etree.Element(self.tag)
        element.text = m.group(3)
        return element
211
212
213
class SubstituteTagPattern(SimpleTagPattern):
    """ Return an element of type `tag` with no text and no children. """
    def handleMatch (self, m):
        # The match content is ignored; only the tag matters.
        empty_element = util.etree.Element(self.tag)
        return empty_element
217
218
219
class BacktickPattern(Pattern):
    """ Return a `<code>` element holding the stripped text of group(3). """
    def __init__ (self, pattern):
        self.tag = "code"
        Pattern.__init__(self, pattern)

    def handleMatch(self, m):
        code_text = m.group(3).strip()
        code = util.etree.Element(self.tag)
        # The text is wrapped in util.AtomicString before being stored.
        code.text = util.AtomicString(code_text)
        return code
229
230
231
class DoubleTagPattern(SimpleTagPattern):
    """Return an element of type tag1 containing one child of type tag2.

    `tag` must be given as "tag1,tag2"; the text of group(3) is placed in
    the inner element.  Useful for strong emphasis etc.

    """
    def handleMatch(self, m):
        outer_tag, inner_tag = self.tag.split(",")
        outer = util.etree.Element(outer_tag)
        inner = util.etree.SubElement(outer, inner_tag)
        inner.text = m.group(3)
        return outer
243
244
245
class HtmlPattern(Pattern):
    """
    Store raw inline html and return a placeholder.

    The pattern must be created with a Markdown instance, because the raw
    html is handed to `self.markdown.htmlStash`.

    """
    def handleMatch (self, m):
        """
        Stash the raw html of group(2).

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        Returns whatever `htmlStash.store()` returns for the stored html.

        """
        rawhtml = m.group(2)
        return self.markdown.htmlStash.store(rawhtml)
252
253
254
class LinkPattern(Pattern):
    """
    Return a link element from the given match.

    The pattern must be created with a Markdown instance, because
    `sanitize_url` reads `self.markdown.safeMode`.

    """
    def handleMatch(self, m):
        """
        Build an `<a>` element from an inline link match.

        Keyword arguments:

        * m: A re match object; group(2) is the link text, group(9) the
          url (optionally wrapped in angle brackets) and group(13) the
          optional title.

        Returns the element; `href` is always set, `title` only when a
        title was matched.

        """
        el = util.etree.Element("a")
        el.text = m.group(2)
        title = m.group(13)
        href = m.group(9)

        if href:
            # Strip the angle brackets only when they really enclose the
            # url.  A url that merely starts with "<" must keep its first
            # and last characters.
            if href[0] == "<" and href[-1] == ">":
                href = href[1:-1]
            el.set("href", self.sanitize_url(href.strip()))
        else:
            el.set("href", "")

        if title:
            el.set("title", dequote(title))
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        Keyword arguments:

        * url: The url to check.

        Returns an empty string when `self.markdown.safeMode` is set and the
        url is not considered safe; otherwise the url as reassembled by
        `urlunparse()`.

        """
        locless_schemes = ['', 'mailto', 'news']
        parts = urlparse(url)
        scheme, netloc = parts[0], parts[1]
        safe_url = netloc != '' or scheme in locless_schemes

        # path, params, query and fragment must not contain a colon
        for part in parts[2:]:
            if ":" in part:
                safe_url = False
                break

        if self.markdown.safeMode and not safe_url:
            return ''
        return urlunparse(parts)
305
306
class ImagePattern(LinkPattern):
    """ Build an `<img>` element from an inline image match. """
    def handleMatch(self, m):
        img = util.etree.Element("img")

        # group(9) is split on whitespace: the first piece is the source,
        # anything after it is joined back together as the title.
        pieces = m.group(9).split()
        if not pieces:
            img.set('src', "")
        else:
            source = pieces[0]
            if source[0] == "<" and source[-1] == ">":
                source = source[1:-1]
            img.set('src', self.sanitize_url(source))
        if len(pieces) > 1:
            img.set('title', dequote(" ".join(pieces[1:])))

        # group(2) is the alt text; attribute definitions are extracted
        # from it only when the Markdown instance enables them.
        alt_text = m.group(2)
        if self.markdown.enable_attributes:
            alt_text = handleAttributes(alt_text, img)

        img.set('alt', alt_text)
        return img
328
329
class ReferencePattern(LinkPattern):
    """ Look up a stored reference and return a link element for it. """

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        # Patterns without a ninth group (the short form "[Google]") raise
        # IndexError here and are treated like an empty id.
        try:
            ref_id = m.group(9).lower()
        except IndexError:
            ref_id = None
        if not ref_id:
            # if we got something like "[Google][]" or "[Goggle]"
            # we'll use "google" as the id
            ref_id = m.group(2).lower()

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        references = self.markdown.references
        if ref_id not in references: # ignore undefined refs
            return None
        href, title = references[ref_id]

        return self.makeTag(href, title, m.group(2))

    def makeTag(self, href, title, text):
        """ Build the `<a>` element; `title` is set only when truthy. """
        link = util.etree.Element('a')
        link.set('href', self.sanitize_url(href))
        if title:
            link.set('title', title)
        link.text = text
        return link
362
363
364
class ImageReferencePattern(ReferencePattern):
    """ Look up a stored reference and return an img element for it. """
    def makeTag(self, href, title, text):
        # Same lookup as ReferencePattern; only the produced element
        # differs: the text becomes the `alt` attribute.
        img = util.etree.Element("img")
        img.set("src", self.sanitize_url(href))
        if title:
            img.set("title", title)
        img.set("alt", text)
        return img
373
374
375
class AutolinkPattern(Pattern):
    """ Turn an autolink (`<http://example/com>`) into a link Element. """
    def handleMatch(self, m):
        # group(2) is used as-is for the href and, wrapped in
        # util.AtomicString, for the link text.
        url = m.group(2)
        link = util.etree.Element("a")
        link.set('href', url)
        link.text = util.AtomicString(url)
        return link
382
383
class AutomailPattern(Pattern):
    """
    Turn an automail link (`<foo@example.com>`) into a mailto link Element.

    Both the link text and the href are written out character by character
    as entity references that start with util.AMP_SUBSTITUTE.
    """
    def handleMatch(self, m):
        address = m.group(2)
        # Drop a leading "mailto:" so it is not repeated in the href.
        if address.startswith("mailto:"):
            address = address[len("mailto:"):]

        def as_entity(code):
            """Return the named entity for code, or a numeric one if none."""
            name = htmlentitydefs.codepoint2name.get(code)
            if not name:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "%s%s;" % (util.AMP_SUBSTITUTE, name)

        link = util.etree.Element('a')
        encoded_text = ''.join([as_entity(ord(ch)) for ch in address])
        link.text = util.AtomicString(encoded_text)

        # The href always uses numeric references, including "mailto:".
        target = "mailto:" + address
        encoded_href = "".join([util.AMP_SUBSTITUTE + '#%d;' % ord(ch)
                                for ch in target])
        link.set('href', encoded_href)
        return link