# markdown/html4.py
#
# Add html4 serialization to older versions of ElementTree
# Taken from ElementTree 1.3 preview with slight modifications
#
# Copyright (c) 1999-2007 by Fredrik Lundh.  All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2007 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------


import util

# Module-level aliases for the ElementTree API exposed through util.etree,
# so the serializer below can refer to them by short name.
ElementTree = util.etree.ElementTree
QName = util.etree.QName
# Use util.etree.test_comment in place of util.etree.Comment when the etree
# object provides it.
# NOTE(review): presumably test_comment is the object that actually appears
# as the tag of comment nodes for some etree backends -- confirm in util.
if hasattr(util.etree, 'test_comment'):
    Comment = util.etree.test_comment
else:
    Comment = util.etree.Comment
PI = util.etree.PI
ProcessingInstruction = util.etree.ProcessingInstruction

# Names of the HTML 4 elements that have no content and no end tag.
# Each name needs its own comma: adjacent string literals are silently
# concatenated, which previously produced a bogus "metaparam" entry and
# left both "meta" and "param" out of the collection.
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param")

try:
    # use a set for membership tests when the builtin is available
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    # no builtin set on this interpreter; keep the tuple
    pass

_namespace_map = {
59
    # "well-known" namespace prefixes
60
    "http://www.w3.org/XML/1998/namespace": "xml",
61
    "http://www.w3.org/1999/xhtml": "html",
62
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
63
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
64
    # xml schema
65
    "http://www.w3.org/2001/XMLSchema": "xs",
66
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
67
    # dublic core
68
    "http://purl.org/dc/elements/1.1/": "dc",
69
}
70
71
72
def _raise_serialization_error(text):
73
    raise TypeError(
74
        "cannot serialize %r (type %s)" % (text, type(text).__name__)
75
        )
76
77
def _encode(text, encoding):
78
    try:
79
        return text.encode(encoding, "xmlcharrefreplace")
80
    except (TypeError, AttributeError):
81
        _raise_serialization_error(text)
82
83
def _escape_cdata(text, encoding):
84
    # escape character data
85
    try:
86
        # it's worth avoiding do-nothing calls for strings that are
87
        # shorter than 500 character, or so.  assume that's, by far,
88
        # the most common case in most applications.
89
        if "&" in text:
90
            text = text.replace("&", "&")
91
        if "<" in text:
92
            text = text.replace("<", "&lt;")
93
        if ">" in text:
94
            text = text.replace(">", "&gt;")
95
        return text.encode(encoding, "xmlcharrefreplace")
96
    except (TypeError, AttributeError):
97
        _raise_serialization_error(text)
98
99
100
def _escape_attrib(text, encoding):
101
    # escape attribute value
102
    try:
103
        if "&" in text:
104
            text = text.replace("&", "&amp;")
105
        if "<" in text:
106
            text = text.replace("<", "&lt;")
107
        if ">" in text:
108
            text = text.replace(">", "&gt;")
109
        if "\"" in text:
110
            text = text.replace("\"", "&quot;")
111
        if "\n" in text:
112
            text = text.replace("\n", "&#10;")
113
        return text.encode(encoding, "xmlcharrefreplace")
114
    except (TypeError, AttributeError):
115
        _raise_serialization_error(text)
116
117
def _escape_attrib_html(text, encoding):
118
    # escape attribute value
119
    try:
120
        if "&" in text:
121
            text = text.replace("&", "&amp;")
122
        if ">" in text:
123
            text = text.replace(">", "&gt;")
124
        if "\"" in text:
125
            text = text.replace("\"", "&quot;")
126
        return text.encode(encoding, "xmlcharrefreplace")
127
    except (TypeError, AttributeError):
128
        _raise_serialization_error(text)
129
130
131
def _serialize_html(write, elem, encoding, qnames, namespaces):
    """Write *elem* and its subtree as HTML 4 markup through *write*.

    write      -- callable that receives each output fragment.
    elem       -- element to serialize; children are handled recursively.
    encoding   -- codec name used to encode text and attribute values.
    qnames     -- mapping from tag/attribute names to their serialized
                  form; a tag that maps to None is written without
                  start/end tags (only its text and children).
    namespaces -- mapping of namespace URI to prefix to declare as xmlns
                  attributes on this element, or None.  Recursive calls
                  always pass None, so declarations are only written on
                  the element this function is first called with.

    Text of "script" and "style" elements is encoded without escaping.
    Elements whose lower-cased tag is in HTML_EMPTY get no end tag.
    The element's tail, if any, is escaped and written last.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                # sorted() rather than an in-place sort, so items() is
                # not required to return a list
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v, encoding)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    # namespaces maps uri -> prefix; sort on prefix
                    ns_items = sorted(namespaces.items(),
                                      key=lambda x: x[1])
                    for v, k in ns_items:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
            write(">")
            # from here on the tag is compared and written lower-cased
            tag = tag.lower()
            if text:
                if tag == "script" or tag == "style":
                    write(_encode(text, encoding))
                else:
                    write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
            if tag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))

def write_html(root, f,
          # keyword arguments
          encoding="us-ascii",
          default_namespace=None):
    """Serialize the tree rooted at *root* as HTML 4 to *f*.

    root              -- root element to serialize; must not be None.
    f                 -- an object with a write() method, or otherwise a
                         file name that is opened in binary mode for
                         writing.
    encoding          -- output encoding; a false value means "us-ascii".
    default_namespace -- optional namespace URI passed on to _namespaces.

    A file opened here from a file name is closed before returning,
    also when serialization fails.  A file object supplied by the caller
    is left open.
    """
    assert root is not None
    opened_here = not hasattr(f, "write")
    if opened_here:
        f = open(f, "wb")
    try:
        write = f.write
        if not encoding:
            encoding = "us-ascii"
        qnames, namespaces = _namespaces(
                root, encoding, default_namespace
                )
        _serialize_html(
                    write, root, encoding, qnames, namespaces
                    )
    finally:
        # only close what this function opened itself
        if opened_here:
            f.close()

# --------------------------------------------------------------------
# serialization support

def _namespaces(elem, encoding, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    elem              -- root element of the tree to scan.
    encoding          -- codec name used to encode the serialized names.
    default_namespace -- optional namespace URI to serialize without a
                         prefix.

    Returns a (qnames, namespaces) tuple.  qnames maps every tag name,
    attribute name and QName value found in the tree to its encoded
    "prefix:local" (or plain) form, plus None -> None.  namespaces maps
    each namespace URI in use to its prefix; the "xml" prefix is never
    recorded there.

    Raises ValueError if default_namespace is given and the tree uses a
    non-qualified name.  Tags that are neither strings, QName instances,
    None, Comment nor PI are reported through _raise_serialization_error,
    which raises TypeError.
    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def encode(text):
        return text.encode(encoding)

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        # unknown namespace: generate a prefix
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = encode("%s:%s" % (prefix, tag))
                else:
                    qnames[qname] = encode(tag) # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = encode(qname)
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator # cET compatibility
    # 'node' rather than 'elem', so the parameter is not rebound
    for node in iterate():
        tag = node.tag
        if isinstance(tag, QName) and tag.text not in qnames:
            add_qname(tag.text)
        elif isinstance(tag, basestring):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces

def to_html_string(element, encoding=None):
    """Serialize *element* as HTML 4 and return the result as one string.

    element  -- element to serialize; it is wrapped in an ElementTree and
                that tree's root is passed to write_html.
    encoding -- output encoding handed to write_html, which falls back
                to "us-ascii" for a false value such as the default None.
    """
    class dummy:
        pass
    data = []
    # minimal file-like object: write() collects the output fragments
    sink = dummy()
    sink.write = data.append
    write_html(ElementTree(element).getroot(), sink, encoding)
    return "".join(data)