### HTML #############################################################################################
# Code for stripping tags and collapsing whitespace.
# Author: Tom De Smedt.
# Copyright (c) 2007 by Tom De Smedt.
# See LICENSE.txt for details.
import sgmllib
import re
from html.entities import name2codepoint
from .BeautifulSoup import UnicodeDammit
def clear_cache():
    """Instantiate ``Cache("html")`` and call its ``clear()`` method.

    NOTE(review): ``Cache`` is not among the imports visible at the top of
    this file -- confirm it is brought into scope elsewhere, otherwise this
    call raises NameError.
    """
    html_cache = Cache("html")
    html_cache.clear()
#### REPLACE ENTITIES ################################################################################
# Windows-1252 is a character encoding of the Latin alphabet,
# used by default in the legacy components of Microsoft Windows.
# List taken from Mark Pilgrim's feedparser.py
cp1252 = {
    chr(128): chr(8364),  # euro sign
    chr(130): chr(8218),  # single low-9 quotation mark
    chr(131): chr(402),   # latin small letter f with hook
    chr(132): chr(8222),  # double low-9 quotation mark
    chr(133): chr(8230),  # horizontal ellipsis
    chr(134): chr(8224),  # dagger
    chr(135): chr(8225),  # double dagger
    chr(136): chr(710),   # modifier letter circumflex accent
    chr(137): chr(8240),  # per mille sign
    chr(138): chr(352),   # latin capital letter s with caron
    chr(139): chr(8249),  # single left-pointing angle quotation mark
    chr(140): chr(338),   # latin capital ligature oe
    chr(142): chr(381),   # latin capital letter z with caron
    chr(145): chr(8216),  # left single quotation mark
    chr(146): chr(8217),  # right single quotation mark
    chr(147): chr(8220),  # left double quotation mark
    chr(148): chr(8221),  # right double quotation mark
    chr(149): chr(8226),  # bullet
    chr(150): chr(8211),  # en dash
    chr(151): chr(8212),  # em dash
    chr(152): chr(732),   # small tilde
    chr(153): chr(8482),  # trade mark sign
    chr(154): chr(353),   # latin small letter s with caron
    chr(155): chr(8250),  # single right-pointing angle quotation mark
    chr(156): chr(339),   # latin small ligature oe
    chr(158): chr(382),   # latin small letter z with caron
    chr(159): chr(376),   # latin capital letter y with diaeresis
}


def replace_entities(ustring, placeholder=" "):
    """Replaces HTML special characters by readable characters.

    Handles named entities (``&amp;``), decimal references (``&#8364;``) and
    hexadecimal references (``&#x20AC;``). Numeric references that point into
    the 128-159 range are remapped through the ``cp1252`` table, so that
    ``&#128;`` yields the euro sign rather than a C1 control character.
    The malformed form without a hash (``&128;``) is also remapped through
    ``cp1252``. Non-breaking spaces, whether written as ``&nbsp;`` or present
    as a literal U+00A0, are turned into plain spaces.

    Only references terminated by ";" are touched, and the pattern refuses to
    run across ">" or a space, so query strings inside tag attributes are
    left alone.

    ustring     -- the text to process; anything that is not a str is first
                   converted with UnicodeDammit.
    placeholder -- returned in place of a reference that cannot be resolved
                   (unknown name, code point out of range).

    Returns the processed str.

    As taken from Leif K-Brooks algorithm on:
    http://groups-beta.google.com/group/comp.lang.python
    """
    _hex_re = re.compile(r'#[xX]([0-9a-fA-F]+)')

    def _numeric(digits, base):
        # Character for a numeric reference, or None if it is not a valid
        # code point. chr() raises ValueError above U+10FFFF and
        # OverflowError for values that do not fit a C int.
        try:
            char = chr(int(digits, base))
        except (ValueError, OverflowError):
            return None
        return cp1252.get(char, char)

    def _repl_func(match):
        if match.group(1):  # Decimal character reference: &#NNN;
            char = _numeric(match.group(2), 10)
            return placeholder if char is None else char
        name = match.group(3)
        hex_ref = _hex_re.fullmatch(name)
        if hex_ref:  # Hexadecimal character reference: &#xHHHH;
            char = _numeric(hex_ref.group(1), 16)
            return placeholder if char is None else char
        # Malformed reference without the hash (e.g. "&128;"):
        # only honoured when it names a Windows-1252 character.
        try:
            return cp1252[chr(int(name))]
        except (ValueError, OverflowError, KeyError):
            pass
        try:
            return chr(name2codepoint[name])
        except KeyError:
            return placeholder

    # Force to Unicode.
    if not isinstance(ustring, str):
        ustring = UnicodeDammit(ustring).str
    # Don't want some weird unicode character here
    # that truncate_spaces() doesn't know of:
    ustring = ustring.replace("&nbsp;", " ").replace("\u00a0", " ")
    # The ^> makes sure nothing inside a tag (i.e. href with query arguments) gets processed.
    _entity_re = re.compile(r'&(?:(#)(\d+)|([^;^> ]+));')
    return _entity_re.sub(_repl_func, ustring)
#### STRIP TAGS ######################################################################################
class Tagstripper(sgmllib.SGMLParser):
def __init__(self):
    """Set up the parser by running the base-class initializer only."""
    super().__init__()
def strip(self, html, exclude=None, linebreaks=False, blocks="\n", breaks="\n", columns="\n"):
    """Parse html and return the text accumulated in self.data.

    The arguments are stored on the instance, then the prepared markup is
    fed to the parser and the parser is closed. The tag handlers that
    consume these settings are outside this method.

    html       -- markup to process; passed through self.prepare() first.
    exclude    -- stored as self.exclude; presumably the tags that should
                  survive stripping (confirm against the tag handlers).
                  Defaults to a fresh empty list on every call.
    linebreaks -- stored as self.linebreaks.
    blocks     -- stored as self.block; presumably the separator used for
                  the tags listed in self.blocks (confirm).
    breaks     -- stored as self.break_; presumably the separator used for
                  the tags listed in self.breaks (confirm).
    columns    -- stored as self.columns.

    Returns self.data as it stands after the parser has been closed.
    """
    self.data = ""
    # A None sentinel instead of a literal [] default: a list default would
    # be one shared object kept on every instance that omits the argument.
    self.exclude = [] if exclude is None else exclude
    self.linebreaks = linebreaks
    self.block = blocks
    self.blocks = [
        "h1", "h2", "h3", "h4", "h5", "h6",
        "p", "center", "blockquote",
        "div", "table", "ul", "ol",
        "pre", "code", "form",
    ]
    self.break_ = breaks
    self.breaks = [
        "br", "tr", "li",
    ]
    self.columns = columns
    self.feed(self.prepare(html))
    self.close()
    return self.data
def prepare(self, html):
# Clean up faulty HTML before parsing.
html = html.replace("
", "
")
html = html.replace("