Skip to content

Commit 8d852cb

Browse files
committed
Implement strict and loose markdown parsing
Loose parsing is the default (`strict=False`):
- strict: the text is HTML-escaped first, so only markdown syntax is parsed
- loose: both markdown and HTML syntax are parsed
1 parent e4a6d16 commit 8d852cb

File tree

1 file changed

+43
-50
lines changed

1 file changed

+43
-50
lines changed
Lines changed: 43 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import html
2020
import re
21+
from typing import Union
2122

2223
import pyrogram
2324
from . import utils
@@ -30,42 +31,52 @@
3031
CODE_DELIM = "`"
3132
PRE_DELIM = "```"
3233

34+
MARKDOWN_RE = re.compile(r"({d})|\[(.+?)\]\((.+?)\)".format(
35+
d="|".join(
36+
["".join(i) for i in [
37+
[r"\{}".format(j) for j in i]
38+
for i in [
39+
PRE_DELIM,
40+
CODE_DELIM,
41+
STRIKE_DELIM,
42+
UNDERLINE_DELIM,
43+
ITALIC_DELIM,
44+
BOLD_DELIM
45+
]
46+
]]
47+
)))
48+
49+
OPENING_TAG = "<{}>"
50+
CLOSING_TAG = "</{}>"
51+
URL_MARKUP = '<a href="{}">{}</a>'
52+
FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM]
53+
3354

3455
class Markdown:
35-
MARKDOWN_RE = re.compile(r"({d})".format(
36-
d="|".join(
37-
["".join(i) for i in [
38-
[r"\{}".format(j) for j in i]
39-
for i in [
40-
PRE_DELIM,
41-
CODE_DELIM,
42-
STRIKE_DELIM,
43-
UNDERLINE_DELIM,
44-
ITALIC_DELIM,
45-
BOLD_DELIM
46-
]
47-
]]
48-
)))
49-
50-
URL_RE = re.compile(r"\[([^[]+)]\(([^(]+)\)")
51-
52-
OPENING_TAG = "<{}>"
53-
CLOSING_TAG = "</{}>"
54-
URL_MARKUP = '<a href="{}">{}</a>'
55-
FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM]
56-
57-
def __init__(self, client: "pyrogram.BaseClient"):
56+
def __init__(self, client: Union["pyrogram.BaseClient", None]):
5857
self.html = HTML(client)
5958

60-
def parse(self, text: str):
61-
text = html.escape(text)
59+
def parse(self, text: str, strict: bool = False):
60+
if strict:
61+
text = html.escape(text)
6262

63-
offset = 0
6463
delims = set()
64+
is_fixed_width = False
65+
66+
for i, match in enumerate(re.finditer(MARKDOWN_RE, text)):
67+
start, _ = match.span()
68+
delim, text_url, url = match.groups()
69+
full = match.group(0)
70+
71+
if delim in FIXED_WIDTH_DELIMS:
72+
is_fixed_width = not is_fixed_width
6573

66-
for i, match in enumerate(re.finditer(Markdown.MARKDOWN_RE, text)):
67-
start, stop = match.span()
68-
delim = match.group(1)
74+
if is_fixed_width and delim not in FIXED_WIDTH_DELIMS:
75+
continue
76+
77+
if text_url:
78+
text = utils.replace_once(text, full, URL_MARKUP.format(url, text_url), start)
79+
continue
6980

7081
if delim == BOLD_DELIM:
7182
tag = "b"
@@ -82,32 +93,14 @@ def parse(self, text: str):
8293
else:
8394
continue
8495

85-
if delim not in Markdown.FIXED_WIDTH_DELIMS and any(x in delims for x in Markdown.FIXED_WIDTH_DELIMS):
86-
continue
87-
8896
if delim not in delims:
8997
delims.add(delim)
90-
tag = Markdown.OPENING_TAG.format(tag)
98+
tag = OPENING_TAG.format(tag)
9199
else:
92100
delims.remove(delim)
93-
tag = Markdown.CLOSING_TAG.format(tag)
94-
95-
text = text[:start + offset] + tag + text[stop + offset:]
96-
97-
offset += len(tag) - len(delim)
98-
99-
offset = 0
100-
101-
for match in re.finditer(Markdown.URL_RE, text):
102-
start, stop = match.span()
103-
full = match.group(0)
104-
105-
body, url = match.groups()
106-
replace = Markdown.URL_MARKUP.format(url, body)
107-
108-
text = text[:start + offset] + replace + text[stop + offset:]
101+
tag = CLOSING_TAG.format(tag)
109102

110-
offset += len(replace) - len(full)
103+
text = utils.replace_once(text, delim, tag, start)
111104

112105
return self.html.parse(text)
113106

0 commit comments

Comments
 (0)