| 1 |
# markdown/html4.py |
| 2 |
# |
| 3 |
# Add html4 serialization to older versions of ElementTree |
| 4 |
# Taken from ElementTree 1.3 preview with slight modifications |
| 5 |
# |
| 6 |
# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved. |
| 7 |
# |
| 8 |
# fredrik@pythonware.com |
| 9 |
# http://www.pythonware.com |
| 10 |
# |
| 11 |
# -------------------------------------------------------------------- |
| 12 |
# The ElementTree toolkit is |
| 13 |
# |
| 14 |
# Copyright (c) 1999-2007 by Fredrik Lundh |
| 15 |
# |
| 16 |
# By obtaining, using, and/or copying this software and/or its |
| 17 |
# associated documentation, you agree that you have read, understood, |
| 18 |
# and will comply with the following terms and conditions: |
| 19 |
# |
| 20 |
# Permission to use, copy, modify, and distribute this software and |
| 21 |
# its associated documentation for any purpose and without fee is |
| 22 |
# hereby granted, provided that the above copyright notice appears in |
| 23 |
# all copies, and that both that copyright notice and this permission |
| 24 |
# notice appear in supporting documentation, and that the name of |
| 25 |
# Secret Labs AB or the author not be used in advertising or publicity |
| 26 |
# pertaining to distribution of the software without specific, written |
| 27 |
# prior permission. |
| 28 |
# |
| 29 |
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD |
| 30 |
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- |
| 31 |
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR |
| 32 |
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY |
| 33 |
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, |
| 34 |
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS |
| 35 |
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE |
| 36 |
# OF THIS SOFTWARE. |
| 37 |
# -------------------------------------------------------------------- |
| 38 |
|
| 39 |
|
| 40 |
import util |
| 41 |
# Local aliases for the ElementTree names used by the serializer below.
ElementTree = util.etree.ElementTree
QName = util.etree.QName
ProcessingInstruction = util.etree.ProcessingInstruction
PI = util.etree.PI
# Use ``util.etree.test_comment`` as the Comment marker when the etree
# object exposes it; otherwise fall back to the regular Comment factory.
if not hasattr(util.etree, 'test_comment'):
    Comment = util.etree.Comment
else:
    Comment = util.etree.test_comment
| 49 |
|
| 50 |
# Names of the HTML 4 elements that never have content and therefore must
# be written without a closing tag.
# NOTE: every entry needs its own comma; adjacent string literals are
# silently concatenated by Python ("meta" "param" == "metaparam").
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param")

# Prefer a set for fast membership tests; keep the tuple on interpreters
# that do not provide the ``set`` builtin.
try:
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    pass
| 57 |
|
| 58 |
_namespace_map = { |
| 59 |
# "well-known" namespace prefixes |
| 60 |
"http://www.w3.org/XML/1998/namespace": "xml", |
| 61 |
"http://www.w3.org/1999/xhtml": "html", |
| 62 |
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", |
| 63 |
"http://schemas.xmlsoap.org/wsdl/": "wsdl", |
| 64 |
# xml schema |
| 65 |
"http://www.w3.org/2001/XMLSchema": "xs", |
| 66 |
"http://www.w3.org/2001/XMLSchema-instance": "xsi", |
| 67 |
# dublic core |
| 68 |
"http://purl.org/dc/elements/1.1/": "dc", |
| 69 |
} |
| 70 |
|
| 71 |
|
| 72 |
def _raise_serialization_error(text): |
| 73 |
raise TypeError( |
| 74 |
"cannot serialize %r (type %s)" % (text, type(text).__name__) |
| 75 |
) |
| 76 |
|
| 77 |
def _encode(text, encoding): |
| 78 |
try: |
| 79 |
return text.encode(encoding, "xmlcharrefreplace") |
| 80 |
except (TypeError, AttributeError): |
| 81 |
_raise_serialization_error(text) |
| 82 |
|
| 83 |
def _escape_cdata(text, encoding): |
| 84 |
# escape character data |
| 85 |
try: |
| 86 |
# it's worth avoiding do-nothing calls for strings that are |
| 87 |
# shorter than 500 character, or so. assume that's, by far, |
| 88 |
# the most common case in most applications. |
| 89 |
if "&" in text: |
| 90 |
text = text.replace("&", "&") |
| 91 |
if "<" in text: |
| 92 |
text = text.replace("<", "<") |
| 93 |
if ">" in text: |
| 94 |
text = text.replace(">", ">") |
| 95 |
return text.encode(encoding, "xmlcharrefreplace") |
| 96 |
except (TypeError, AttributeError): |
| 97 |
_raise_serialization_error(text) |
| 98 |
|
| 99 |
|
| 100 |
def _escape_attrib(text, encoding): |
| 101 |
# escape attribute value |
| 102 |
try: |
| 103 |
if "&" in text: |
| 104 |
text = text.replace("&", "&") |
| 105 |
if "<" in text: |
| 106 |
text = text.replace("<", "<") |
| 107 |
if ">" in text: |
| 108 |
text = text.replace(">", ">") |
| 109 |
if "\"" in text: |
| 110 |
text = text.replace("\"", """) |
| 111 |
if "\n" in text: |
| 112 |
text = text.replace("\n", " ") |
| 113 |
return text.encode(encoding, "xmlcharrefreplace") |
| 114 |
except (TypeError, AttributeError): |
| 115 |
_raise_serialization_error(text) |
| 116 |
|
| 117 |
def _escape_attrib_html(text, encoding): |
| 118 |
# escape attribute value |
| 119 |
try: |
| 120 |
if "&" in text: |
| 121 |
text = text.replace("&", "&") |
| 122 |
if ">" in text: |
| 123 |
text = text.replace(">", ">") |
| 124 |
if "\"" in text: |
| 125 |
text = text.replace("\"", """) |
| 126 |
return text.encode(encoding, "xmlcharrefreplace") |
| 127 |
except (TypeError, AttributeError): |
| 128 |
_raise_serialization_error(text) |
| 129 |
|
| 130 |
|
| 131 |
def _serialize_html(write, elem, encoding, qnames, namespaces):
    """Recursively write *elem* and its subtree as HTML.

    Arguments:
        write: callable invoked with each serialized fragment.
        elem: the element to serialize.
        encoding: encoding applied to text and attribute values.
        qnames: mapping from tag/attribute names to their serialized
            form; an element whose tag maps to None is written without
            its own start/end tags (only its text and children).
        namespaces: mapping of namespace URI to prefix, written as
            ``xmlns`` attributes on this element when non-empty.
            Children are always serialized with None.

    Elements whose lower-cased tag is in HTML_EMPTY get no closing tag,
    and the text of ``script`` and ``style`` elements is encoded without
    escaping.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v, encoding)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    # sort on prefix
                    ns_items = sorted(namespaces.items(),
                                      key=lambda x: x[1])
                    for v, k in ns_items:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
            write(">")
            # Compare case-insensitively, but keep the tag as written so
            # the closing tag matches the opening tag exactly.
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(_encode(text, encoding))
                else:
                    write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))
| 182 |
|
| 183 |
def write_html(root, f,
               # keyword arguments
               encoding="us-ascii",
               default_namespace=None):
    """Serialize the tree rooted at *root* as HTML and write it to *f*.

    Arguments:
        root: root element of the tree; must not be None.
        f: an object with a ``write`` method, or otherwise a value that
            is passed to ``open(f, "wb")``.
        encoding: output encoding; a false value means "us-ascii".
        default_namespace: forwarded to ``_namespaces``.

    A file opened by this function is closed before returning, even when
    serialization fails.  An object passed in by the caller is left open.
    """
    assert root is not None
    opened_here = not hasattr(f, "write")
    if opened_here:
        f = open(f, "wb")
    try:
        write = f.write
        if not encoding:
            encoding = "us-ascii"
        qnames, namespaces = _namespaces(
            root, encoding, default_namespace
            )
        _serialize_html(
            write, root, encoding, qnames, namespaces
            )
    finally:
        # Only release the handle we created ourselves.
        if opened_here:
            f.close()
| 199 |
|
| 200 |
# -------------------------------------------------------------------- |
| 201 |
# serialization support |
| 202 |
|
| 203 |
def _namespaces(elem, encoding, default_namespace=None):
    """Identify the qualified names and namespaces used in a tree.

    Walks *elem* and its descendants and registers every tag, attribute
    name, and QName-valued attribute value or text.

    Arguments:
        elem: root element of the tree to scan.
        encoding: encoding applied to the serialized names.
        default_namespace: optional namespace URI that is mapped to the
            empty prefix.

    Returns:
        A ``(qnames, namespaces)`` tuple.  ``qnames`` maps each name to
        its *encoded* ``prefix:local`` (or plain local) form and always
        contains ``None: None``; ``namespaces`` maps namespace URIs to
        prefixes.

    Raises:
        ValueError: if a non-qualified name is found while
            *default_namespace* is set.
        TypeError: via ``_raise_serialization_error`` for a tag or name
            that cannot be serialized.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def encode(text):
        return text.encode(encoding)

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = encode("%s:%s" % (prefix, tag))
                else:
                    qnames[qname] = encode(tag)  # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = encode(qname)
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator  # cET compatibility
    for elem in iterate():
        tag = elem.tag
        if isinstance(tag, QName):
            # Keep the membership test nested: a QName tag that is
            # already registered must not fall through to the error
            # branch below.
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, basestring):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
| 269 |
|
| 270 |
def to_html_string(element, encoding=None):
    """Serialize *element* as HTML and return the result as one string.

    The fragments produced by ``write_html`` are collected in a list
    through a minimal object exposing only a ``write`` attribute, then
    joined together.
    """
    class _Collector:
        pass

    chunks = []
    sink = _Collector()
    sink.write = chunks.append
    root = ElementTree(element).getroot()
    write_html(root, sink, encoding)
    return "".join(chunks)