Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os, os.path, sys 
  9   
 10  this_dir = os.path.dirname(__file__) 
 11  if this_dir not in sys.path: 
 12      sys.path.insert(0, this_dir) # needed for Py3 
 13   
 14  from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str 
 15  from common_imports import SillyFileLike, HelperTestCase, write_to_file, next 
 16   
 17  try: 
 18      unicode 
 19  except NameError: 
 20      unicode = str 
 21   
22 -class HtmlParserTestCase(HelperTestCase):
23 """HTML parser test cases 24 """ 25 etree = etree 26 27 html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>") 28 html_str_pretty = _bytes("""\ 29 <html> 30 <head><title>test</title></head> 31 <body><h1>page title</h1></body> 32 </html> 33 """) 34 broken_html_str = _bytes("<html><head><title>test<body><h1>page title</h3></p></html>") 35 uhtml_str = _str("<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>") 36
def tearDown(self):
    """Run the base-class teardown, then reset etree's default parser.

    ``set_default_parser()`` is called without arguments after every test,
    so a parser that a test installed with ``set_default_parser(parser)``
    is not carried over into later tests.
    """
    super(HtmlParserTestCase, self).tearDown()
    self.etree.set_default_parser()
40
def test_module_HTML(self):
    # Round trip of the well-formed sample: parsing html_str with
    # etree.HTML() and serialising the result with method="html" must give
    # back exactly the bytes that went in.
    element = self.etree.HTML(self.html_str)
    serialised = self.etree.tostring(element, method="html")
    self.assertEqual(serialised, self.html_str)
45
def test_module_HTML_unicode(self):
    # Round trip of the text sample (uhtml_str), which contains non-ASCII
    # characters: parse it, serialise as UTF-8, and compare after decoding
    # both sides to ``unicode``.
    element = self.etree.HTML(self.uhtml_str)
    serialised = self.etree.tostring(element, method="html",
                                     encoding='UTF8')
    expected = self.uhtml_str.encode('UTF8')
    self.assertEqual(unicode(serialised, 'UTF8'),
                     unicode(expected, 'UTF8'))
51
53 element = self.etree.HTML(self.html_str) 54 self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True), 55 self.html_str_pretty)
56
58 parser = self.etree.HTMLParser(recover=False) 59 parse = self.etree.parse 60 f = BytesIO("<html></body>") 61 self.assertRaises(self.etree.XMLSyntaxError, 62 parse, f, parser)
63
65 parser = self.etree.HTMLParser() 66 Element = parser.makeelement 67 68 el = Element('name') 69 self.assertRaises(ValueError, Element, '{}') 70 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 71 72 self.assertRaises(ValueError, Element, '{test}') 73 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
74
76 parser = self.etree.HTMLParser() 77 Element = parser.makeelement 78 79 pname = Element('p:name') 80 self.assertEqual(pname.tag, 'p:name') 81 82 pname = Element('{test}p:name') 83 self.assertEqual(pname.tag, '{test}p:name') 84 85 pname = Element('name') 86 pname.tag = 'p:name' 87 self.assertEqual(pname.tag, 'p:name')
88
90 parser = self.etree.HTMLParser() 91 Element = parser.makeelement 92 93 self.assertRaises(ValueError, Element, 'p"name') 94 self.assertRaises(ValueError, Element, "na'me") 95 self.assertRaises(ValueError, Element, '{test}"name') 96 self.assertRaises(ValueError, Element, "{test}name'") 97 98 el = Element('name') 99 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 100 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 101 self.assertEqual(el.tag, "name")
102
104 parser = self.etree.HTMLParser() 105 Element = parser.makeelement 106 107 self.assertRaises(ValueError, Element, ' name ') 108 self.assertRaises(ValueError, Element, 'na me') 109 self.assertRaises(ValueError, Element, '{test} name') 110 111 el = Element('name') 112 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 113 self.assertEqual(el.tag, "name")
114
116 parser = self.etree.HTMLParser() 117 Element = parser.makeelement 118 119 SubElement = self.etree.SubElement 120 121 el = Element('name') 122 self.assertRaises(ValueError, SubElement, el, '{}') 123 self.assertRaises(ValueError, SubElement, el, '{test}')
124
126 parser = self.etree.HTMLParser() 127 Element = parser.makeelement 128 SubElement = self.etree.SubElement 129 130 el = Element('name') 131 pname = SubElement(el, 'p:name') 132 self.assertEqual(pname.tag, 'p:name') 133 134 pname = SubElement(el, '{test}p:name') 135 self.assertEqual(pname.tag, '{test}p:name')
136
138 parser = self.etree.HTMLParser() 139 Element = parser.makeelement 140 SubElement = self.etree.SubElement 141 142 el = Element('name') 143 self.assertRaises(ValueError, SubElement, el, "name'") 144 self.assertRaises(ValueError, SubElement, el, 'na"me') 145 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 146 self.assertRaises(ValueError, SubElement, el, '{test}"name')
147
149 parser = self.etree.HTMLParser() 150 Element = parser.makeelement 151 SubElement = self.etree.SubElement 152 153 el = Element('name') 154 self.assertRaises(ValueError, SubElement, el, ' name ') 155 self.assertRaises(ValueError, SubElement, el, 'na me') 156 self.assertRaises(ValueError, SubElement, el, '{test} name')
157
159 parser = self.etree.HTMLParser(recover=False) 160 parse = self.etree.parse 161 f = BytesIO(self.broken_html_str) 162 self.assertRaises(self.etree.XMLSyntaxError, 163 parse, f, parser)
164
166 text = _str('Søk på nettet') 167 html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1') 168 169 tree = self.etree.parse( 170 BytesIO(html_latin1), 171 self.etree.HTMLParser(encoding="iso-8859-1")) 172 p = tree.find("//p") 173 self.assertEqual(p.text, text)
174
176 text = _str('Søk på nettet') 177 wrong_head = _str(''' 178 <head> 179 <meta http-equiv="Content-Type" 180 content="text/html; charset=UTF-8" /> 181 </head>''') 182 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head, 183 text) 184 ).encode('iso-8859-1') 185 186 self.assertRaises(self.etree.ParseError, 187 self.etree.parse, 188 BytesIO(html_latin1)) 189 190 tree = self.etree.parse( 191 BytesIO(html_latin1), 192 self.etree.HTMLParser(encoding="iso-8859-1")) 193 p = tree.find("//p") 194 self.assertEqual(p.text, text)
195
def test_module_HTML_broken(self):
    # etree.HTML() is given the malformed sample (unclosed <title>,
    # mismatched </h3>, stray </p>); the serialised result is expected to
    # equal the well-formed sample, i.e. the markup gets repaired.
    element = self.etree.HTML(self.broken_html_str)
    serialised = self.etree.tostring(element, method="html")
    self.assertEqual(serialised, self.html_str)
200
def test_module_HTML_cdata(self):
    # The original note says libxml2 generates CDATA nodes for <script>
    # content by default; this test exercises <style> content
    # (presumably handled the same way -- confirm against libxml2).
    # Whatever the node type, the content must be readable through .text.
    html = _bytes('<html><head><style>foo</style></head></html>')
    element = self.etree.HTML(html)
    style = element[0][0]  # <html> -> <head> -> <style>
    self.assertEqual(style.text, "foo")
206
def test_module_HTML_access(self):
    # Index-based navigation on the parsed tree: the first child of the
    # first child of the root (<html> -> <head> -> <title>) is <title>.
    element = self.etree.HTML(self.html_str)
    first_grandchild = element[0][0]
    self.assertEqual(first_grandchild.tag, 'title')
210
def test_module_parse_html(self):
    # Round trip through a real file on disk: write the well-formed sample
    # to a temporary .html file, parse it from an open binary file object
    # with an HTMLParser, and check that serialising the root reproduces
    # the sample.
    parser = self.etree.HTMLParser()
    # mkstemp() creates the file itself, unlike the deprecated, race-prone
    # mktemp(), which only returns a name someone else could claim first.
    handle, filename = tempfile.mkstemp(suffix=".html")
    os.close(handle)
    try:
        # Writing happens inside the try block so the temporary file is
        # removed even if the write fails.
        write_to_file(filename, self.html_str, 'wb')
        f = open(filename, 'rb')
        try:
            tree = self.etree.parse(f, parser)
        finally:
            # Always release the handle; an open handle would also make
            # os.remove() fail on platforms that lock open files.
            f.close()
        self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
                         self.html_str)
    finally:
        os.remove(filename)
223
225 parser = self.etree.HTMLParser() 226 f = SillyFileLike(self.html_str) 227 tree = self.etree.parse(f, parser) 228 html = self.etree.tostring(tree.getroot(), 229 method="html", encoding='UTF-8') 230 self.assertEqual(html, self.html_str)

## def test_module_parse_html_filelike_unicode(self):
##     parser = self.etree.HTMLParser()
##     f = SillyFileLike(self.uhtml_str)
##     tree = self.etree.parse(f, parser)
##     html = self.etree.tostring(tree.getroot(), encoding='UTF-8')
##     self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str)

def test_html_file_error(self):
    # Parsing a path that (hopefully) does not exist with an HTMLParser
    # must raise IOError.
    parser = self.etree.HTMLParser()
    parse = self.etree.parse
    missing_path = "__some_hopefully_nonexisting_file__.html"
    self.assertRaises(IOError, parse, missing_path, parser)
245
247 self.assertRaises(self.etree.XMLSyntaxError, 248 self.etree.parse, BytesIO(self.broken_html_str)) 249 250 self.etree.set_default_parser( self.etree.HTMLParser() ) 251 252 tree = self.etree.parse(BytesIO(self.broken_html_str)) 253 self.assertEqual(self.etree.tostring(tree.getroot(), method="html"), 254 self.html_str) 255 256 self.etree.set_default_parser() 257 258 self.assertRaises(self.etree.XMLSyntaxError, 259 self.etree.parse, BytesIO(self.broken_html_str))
260
def test_html_iterparse(self):
    # iterparse(..., html=True) with default events: only 'end' events are
    # expected, one per element, in the order the elements are closed.
    iterparse = self.etree.iterparse
    f = BytesIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')

    iterator = iterparse(f, html=True)
    # No root is available before the iterator has been consumed.
    self.assertEqual(None, iterator.root)

    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)

    head, body = root[0], root[1]
    expected = [
        ('end', head[0]),   # <title>
        ('end', head),
        ('end', body[0]),   # <p>
        ('end', body),
        ('end', root),
    ]
    self.assertEqual(expected, events)
276
278 iterparse = self.etree.iterparse 279 f = BytesIO( 280 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 281 282 iterator = iterparse(f, html=True) 283 self.assertEqual(None, iterator.root) 284 285 event, element = next(iterator) 286 self.assertEqual('end', event) 287 self.assertEqual('title', element.tag) 288 self.assertEqual(None, iterator.root) 289 del element 290 291 event, element = next(iterator) 292 self.assertEqual('end', event) 293 self.assertEqual('head', element.tag) 294 self.assertEqual(None, iterator.root) 295 del element 296 del iterator
297
299 iterparse = self.etree.iterparse 300 f = BytesIO('<head><title>TEST></head><p>P<br></div>') 301 302 iterator = iterparse(f, html=True) 303 self.assertEqual(None, iterator.root) 304 305 events = list(iterator) 306 root = iterator.root 307 self.assertTrue(root is not None) 308 self.assertEqual('html', root.tag) 309 self.assertEqual('head', root[0].tag) 310 self.assertEqual('body', root[1].tag) 311 self.assertEqual('p', root[1][0].tag) 312 self.assertEqual('br', root[1][0][0].tag) 313 self.assertEqual( 314 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]), 315 ('end', root[1][0]), ('end', root[1]), ('end', root)], 316 events)
317
319 iterparse = self.etree.iterparse 320 f = BytesIO('<p>P<br></div>') 321 iterator = iterparse(f, html=True, recover=False) 322 self.assertRaises(self.etree.XMLSyntaxError, list, iterator)
323
def test_html_iterparse_file(self):
    # iterparse over the shakespeare.html fixture from the test directory,
    # given by path rather than as a file object.
    iterparse = self.etree.iterparse
    iterator = iterparse(fileInTestDir("shakespeare.html"),
                         html=True)

    # No root is available before the iterator has been consumed.
    self.assertEqual(None, iterator.root)
    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)

    # The fixture is expected to yield exactly 249 events ...
    self.assertEqual(249, len(events))
    # ... and every one of them must be an 'end' event.
    unexpected = [event for (event, _element) in events if event != 'end']
    self.assertEqual([], unexpected)
337
339 iterparse = self.etree.iterparse 340 f = BytesIO( 341 '<html><head><title>TITLE</title><body><p>P</p></body></html>') 342 343 iterator = iterparse(f, html=True, events=('start',)) 344 self.assertEqual(None, iterator.root) 345 346 events = list(iterator) 347 root = iterator.root 348 self.assertTrue(root is not None) 349 self.assertEqual( 350 [('start', root), ('start', root[0]), ('start', root[0][0]), 351 ('start', root[1]), ('start', root[1][0])], 352 events)
353
355 assertFalse = self.assertFalse 356 events = [] 357 class Target(object): 358 def start(self, tag, attrib): 359 events.append(("start", tag)) 360 assertFalse(attrib)
361 def end(self, tag): 362 events.append(("end", tag))
363 def close(self): 364 return "DONE" 365 366 parser = self.etree.HTMLParser(target=Target()) 367 368 parser.feed("<html><body></body></html>") 369 done = parser.close() 370 371 self.assertEqual("DONE", done) 372 self.assertEqual([ 373 ("start", "html"), ("start", "body"), 374 ("end", "body"), ("end", "html")], events) 375
def test_html_parser_target_doctype_empty(self):
    # A parser target with a doctype() callback is fed a document whose
    # DOCTYPE declaration is empty ("<!DOCTYPE>"). The callback is expected
    # to fire once with (None, None, None), before any element event.
    assertFalse = self.assertFalse
    events = []

    class Target(object):
        # Records every callback in ``events``; close() returns the value
        # that parser.close() is expected to hand back.
        def start(self, tag, attrib):
            events.append(("start", tag))
            # None of the elements in the input carries attributes.
            assertFalse(attrib)

        def end(self, tag):
            events.append(("end", tag))

        def doctype(self, *args):
            events.append(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    parser.feed("<!DOCTYPE><html><body></body></html>")
    done = parser.close()

    self.assertEqual("DONE", done)
    self.assertEqual([
        ("doctype", (None, None, None)),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html")], events)
def test_html_parser_target_doctype_html(self):
    # A parser target with a doctype() callback is fed a document starting
    # with "<!DOCTYPE html>". The callback is expected to fire once with
    # ("html", None, None), before any element event.
    assertFalse = self.assertFalse
    events = []

    class Target(object):
        # Records every callback in ``events``; close() returns the value
        # that parser.close() is expected to hand back.
        def start(self, tag, attrib):
            events.append(("start", tag))
            # None of the elements in the input carries attributes.
            assertFalse(attrib)

        def end(self, tag):
            events.append(("end", tag))

        def doctype(self, *args):
            events.append(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    parser.feed("<!DOCTYPE html><html><body></body></html>")
    done = parser.close()

    self.assertEqual("DONE", done)
    self.assertEqual([
        ("doctype", ("html", None, None)),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html")], events)
def test_html_parser_target_doctype_html_full(self):
    # A parser target with a doctype() callback is fed a document with a
    # full DOCTYPE (name, PUBLIC identifier and system identifier). The
    # callback is expected to fire once with all three values, before any
    # element event.
    assertFalse = self.assertFalse
    events = []

    class Target(object):
        # Records every callback in ``events``; close() returns the value
        # that parser.close() is expected to hand back.
        def start(self, tag, attrib):
            events.append(("start", tag))
            # None of the elements in the input carries attributes.
            assertFalse(attrib)

        def end(self, tag):
            events.append(("end", tag))

        def doctype(self, *args):
            events.append(("doctype", args))

        def close(self):
            return "DONE"

    parser = self.etree.HTMLParser(target=Target())
    parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
                '<html><body></body></html>')
    done = parser.close()

    self.assertEqual("DONE", done)
    self.assertEqual([
        ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
        ("start", "html"), ("start", "body"),
        ("end", "body"), ("end", "html")], events)
def test_suite():
    """Build and return a TestSuite holding the tests of HtmlParserTestCase."""
    suite = unittest.TestSuite()
    # unittest.makeSuite() is deprecated and removed in Python 3.13; the
    # default loader collects the same ``test*`` methods.
    loader = unittest.defaultTestLoader
    suite.addTests([loader.loadTestsFromTestCase(HtmlParserTestCase)])
    return suite


if __name__ == '__main__':
    # Running this module directly only prints a hint pointing at test.py.
    print('to test use test.py %s' % __file__)