forked from core-api/python-client
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml.py
More file actions
23 lines (20 loc) · 775 Bytes
/
html.py
File metadata and controls
23 lines (20 loc) · 775 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# coding: utf-8
from coreapi.codecs.base import BaseCodec
import html2text
import re
class HTMLCodec(BaseCodec):
    """Codec that turns a ``text/html`` payload into plain text."""

    media_type = 'text/html'

    def load(self, bytes, base_url=None):
        """Return the text content of an HTML document.

        The payload is decoded as UTF-8, rendered to text with
        ``html2text`` (links and images are dropped), and then tidied up:
        each line is stripped, runs of blank lines are collapsed to a
        single blank line, and leading ``#`` header markers are removed.

        ``bytes`` is the raw HTML payload. ``base_url`` is accepted but
        not used by this method.
        """
        markup = bytes.decode('utf-8')

        # Render the HTML to text, leaving out link and image markup.
        renderer = html2text.HTML2Text()
        renderer.ignore_links = True
        renderer.ignore_images = True
        rendered = renderer.handle(markup).strip()

        # Trim surrounding whitespace from every individual line.
        unpadded = '\n'.join(row.strip() for row in rendered.splitlines())

        # Squash three or more consecutive newlines down to two.
        collapsed = re.sub(r'\n\n\n+', '\n\n', unpadded)

        # Drop the markdown header prefixes generated by the renderer.
        return re.sub('^#+ ', '', collapsed, flags=re.MULTILINE)