499 أسطر
16 KiB
Python
499 أسطر
16 KiB
Python
import re
|
|
from typing import Optional, List, Tuple, Match, Pattern
|
|
import string
|
|
from .util import (
|
|
unikey,
|
|
escape_url,
|
|
expand_tab,
|
|
expand_leading_tab,
|
|
)
|
|
from .core import Parser, BlockState
|
|
from .helpers import (
|
|
LINK_LABEL,
|
|
HTML_TAGNAME,
|
|
HTML_ATTRIBUTES,
|
|
BLOCK_TAGS,
|
|
PRE_TAGS,
|
|
unescape_char,
|
|
parse_link_href,
|
|
parse_link_title,
|
|
)
|
|
from .list_parser import parse_list, LIST_PATTERN
|
|
|
|
_INDENT_CODE_TRIM = re.compile(r"^ {1,4}", flags=re.M)
|
|
_ATX_HEADING_TRIM = re.compile(r"(\s+|^)#+\s*$")
|
|
_BLOCK_QUOTE_TRIM = re.compile(r"^ ?", flags=re.M)
|
|
_BLOCK_QUOTE_LEADING = re.compile(r"^ *>", flags=re.M)
|
|
|
|
_LINE_BLANK_END = re.compile(r"\n[ \t]*\n$")
|
|
_BLANK_TO_LINE = re.compile(r"[ \t]*\n")
|
|
|
|
_BLOCK_TAGS_PATTERN = "(" + "|".join(BLOCK_TAGS) + "|" + "|".join(PRE_TAGS) + ")"
|
|
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r"[ \t]*>[ \t]*(?:\n|$)")
|
|
_CLOSE_TAG_END = re.compile(r"[ \t]*>[ \t]*(?:\n|$)")
|
|
_STRICT_BLOCK_QUOTE = re.compile(r"( {0,3}>[^\n]*(?:\n|$))+")
|
|
|
|
|
|
class BlockParser(Parser[BlockState]):
|
|
state_cls = BlockState
|
|
|
|
BLANK_LINE = re.compile(r"(^[ \t\v\f]*\n)+", re.M)
|
|
|
|
RAW_HTML = (
|
|
r"^ {0,3}("
|
|
r"</?" + HTML_TAGNAME + r"|"
|
|
r"<!--|" # comment
|
|
r"<\?|" # script
|
|
r"<![A-Z]|"
|
|
r"<!\[CDATA\[)"
|
|
)
|
|
|
|
BLOCK_HTML = (
|
|
r"^ {0,3}(?:"
|
|
r"(?:</?" + _BLOCK_TAGS_PATTERN + r"(?:[ \t]+|\n|$))"
|
|
r"|<!--" # comment
|
|
r"|<\?" # script
|
|
r"|<![A-Z]"
|
|
r"|<!\[CDATA\[)"
|
|
)
|
|
|
|
SPECIFICATION = {
|
|
"blank_line": r"(^[ \t\v\f]*\n)+",
|
|
"atx_heading": r"^ {0,3}(?P<atx_1>#{1,6})(?!#+)(?P<atx_2>[ \t]*|[ \t]+.*?)$",
|
|
"setex_heading": r"^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$",
|
|
"fenced_code": (
|
|
r"^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})"
|
|
r"[ \t]*(?P<fenced_3>.*?)$"
|
|
),
|
|
"indent_code": (
|
|
r"^(?: {4}| *\t)[^\n]+(?:\n+|$)"
|
|
r"((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*"
|
|
),
|
|
"thematic_break": r"^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$",
|
|
"ref_link": r"^ {0,3}\[(?P<reflink_1>" + LINK_LABEL + r")\]:",
|
|
"block_quote": r"^ {0,3}>(?P<quote_1>.*?)$",
|
|
"list": LIST_PATTERN,
|
|
"block_html": BLOCK_HTML,
|
|
"raw_html": RAW_HTML,
|
|
}
|
|
|
|
DEFAULT_RULES = (
|
|
"fenced_code",
|
|
"indent_code",
|
|
"atx_heading",
|
|
"setex_heading",
|
|
"thematic_break",
|
|
"block_quote",
|
|
"list",
|
|
"ref_link",
|
|
"raw_html",
|
|
"blank_line",
|
|
)
|
|
|
|
def __init__(
|
|
self,
|
|
block_quote_rules: Optional[List[str]] = None,
|
|
list_rules: Optional[List[str]] = None,
|
|
max_nested_level: int = 6,
|
|
):
|
|
super(BlockParser, self).__init__()
|
|
|
|
if block_quote_rules is None:
|
|
block_quote_rules = list(self.DEFAULT_RULES)
|
|
|
|
if list_rules is None:
|
|
list_rules = list(self.DEFAULT_RULES)
|
|
|
|
self.block_quote_rules = block_quote_rules
|
|
self.list_rules = list_rules
|
|
self.max_nested_level = max_nested_level
|
|
# register default parse methods
|
|
self._methods = {name: getattr(self, "parse_" + name) for name in self.SPECIFICATION}
|
|
|
|
def parse_blank_line(self, m: Match[str], state: BlockState) -> int:
|
|
"""Parse token for blank lines."""
|
|
state.append_token({"type": "blank_line"})
|
|
return m.end()
|
|
|
|
def parse_thematic_break(self, m: Match[str], state: BlockState) -> int:
|
|
"""Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
|
|
state.append_token({"type": "thematic_break"})
|
|
# $ does not count '\n'
|
|
return m.end() + 1
|
|
|
|
def parse_indent_code(self, m: Match[str], state: BlockState) -> int:
|
|
"""Parse token for code block which is indented by 4 spaces."""
|
|
# it is a part of the paragraph
|
|
end_pos = state.append_paragraph()
|
|
if end_pos:
|
|
return end_pos
|
|
|
|
code = m.group(0)
|
|
code = expand_leading_tab(code)
|
|
code = _INDENT_CODE_TRIM.sub("", code)
|
|
code = code.strip("\n")
|
|
state.append_token({"type": "block_code", "raw": code, "style": "indent"})
|
|
return m.end()
|
|
|
|
def parse_fenced_code(self, m: Match[str], state: BlockState) -> Optional[int]:
|
|
"""Parse token for fenced code block. A fenced code block is started with
|
|
3 or more backtick(`) or tilde(~).
|
|
|
|
An example of a fenced code block:
|
|
|
|
.. code-block:: markdown
|
|
|
|
```python
|
|
def markdown(text):
|
|
return mistune.html(text)
|
|
```
|
|
"""
|
|
spaces = m.group("fenced_1")
|
|
marker = m.group("fenced_2")
|
|
info = m.group("fenced_3")
|
|
|
|
c = marker[0]
|
|
if info and c == "`":
|
|
# CommonMark Example 145
|
|
# Info strings for backtick code blocks cannot contain backticks
|
|
if info.find(c) != -1:
|
|
return None
|
|
|
|
_end = re.compile(r"^ {0,3}" + c + "{" + str(len(marker)) + r",}[ \t]*(?:\n|$)", re.M)
|
|
cursor_start = m.end() + 1
|
|
|
|
m2 = _end.search(state.src, cursor_start)
|
|
if m2:
|
|
code = state.src[cursor_start : m2.start()]
|
|
end_pos = m2.end()
|
|
else:
|
|
code = state.src[cursor_start:]
|
|
end_pos = state.cursor_max
|
|
|
|
if spaces and code:
|
|
_trim_pattern = re.compile("^ {0," + str(len(spaces)) + "}", re.M)
|
|
code = _trim_pattern.sub("", code)
|
|
|
|
token = {"type": "block_code", "raw": code, "style": "fenced", "marker": marker}
|
|
if info:
|
|
info = unescape_char(info)
|
|
token["attrs"] = {"info": info.strip()}
|
|
|
|
state.append_token(token)
|
|
return end_pos
|
|
|
|
def parse_atx_heading(self, m: Match[str], state: BlockState) -> int:
|
|
"""Parse token for ATX heading. An ATX heading is started with 1 to 6
|
|
symbol of ``#``."""
|
|
level = len(m.group("atx_1"))
|
|
text = m.group("atx_2").strip(string.whitespace)
|
|
# remove last #
|
|
if text:
|
|
text = _ATX_HEADING_TRIM.sub("", text)
|
|
|
|
token = {"type": "heading", "text": text, "attrs": {"level": level}, "style": "atx"}
|
|
state.append_token(token)
|
|
return m.end() + 1
|
|
|
|
def parse_setex_heading(self, m: Match[str], state: BlockState) -> Optional[int]:
|
|
"""Parse token for setex style heading. A setex heading syntax looks like:
|
|
|
|
.. code-block:: markdown
|
|
|
|
H1 title
|
|
========
|
|
"""
|
|
last_token = state.last_token()
|
|
if last_token and last_token["type"] == "paragraph":
|
|
level = 1 if m.group("setext_1") == "=" else 2
|
|
last_token["type"] = "heading"
|
|
last_token["style"] = "setext"
|
|
last_token["attrs"] = {"level": level}
|
|
return m.end() + 1
|
|
|
|
sc = self.compile_sc(["thematic_break", "list"])
|
|
m2 = sc.match(state.src, state.cursor)
|
|
if m2:
|
|
return self.parse_method(m2, state)
|
|
return None
|
|
|
|
def parse_ref_link(self, m: Match[str], state: BlockState) -> Optional[int]:
|
|
"""Parse link references and save the link information into ``state.env``.
|
|
|
|
Here is an example of a link reference:
|
|
|
|
.. code-block:: markdown
|
|
|
|
a [link][example]
|
|
|
|
[example]: https://example.com "Optional title"
|
|
|
|
This method will save the link reference into ``state.env`` as::
|
|
|
|
state.env['ref_links']['example'] = {
|
|
'url': 'https://example.com',
|
|
'title': "Optional title",
|
|
}
|
|
"""
|
|
end_pos = state.append_paragraph()
|
|
if end_pos:
|
|
return end_pos
|
|
|
|
label = m.group("reflink_1")
|
|
key = unikey(label)
|
|
if not key:
|
|
return None
|
|
|
|
href, href_pos = parse_link_href(state.src, m.end(), block=True)
|
|
if href is None:
|
|
return None
|
|
|
|
assert href_pos is not None
|
|
|
|
_blank = self.BLANK_LINE.search(state.src, href_pos)
|
|
if _blank:
|
|
max_pos = _blank.start()
|
|
else:
|
|
max_pos = state.cursor_max
|
|
|
|
title, title_pos = parse_link_title(state.src, href_pos, max_pos)
|
|
if title_pos:
|
|
m2 = _BLANK_TO_LINE.match(state.src, title_pos)
|
|
if m2:
|
|
title_pos = m2.end()
|
|
else:
|
|
title_pos = None
|
|
title = None
|
|
|
|
if title_pos is None:
|
|
m3 = _BLANK_TO_LINE.match(state.src, href_pos)
|
|
if m3:
|
|
href_pos = m3.end()
|
|
else:
|
|
href_pos = None
|
|
href = None
|
|
|
|
end_pos = title_pos or href_pos
|
|
if not end_pos:
|
|
return None
|
|
|
|
if key not in state.env["ref_links"]:
|
|
assert href is not None
|
|
href = unescape_char(href)
|
|
data = {"url": escape_url(href), "label": label}
|
|
if title:
|
|
data["title"] = title
|
|
state.env["ref_links"][key] = data
|
|
return end_pos
|
|
|
|
def extract_block_quote(self, m: Match[str], state: BlockState) -> Tuple[str, Optional[int]]:
|
|
"""Extract text and cursor end position of a block quote."""
|
|
|
|
# cleanup at first to detect if it is code block
|
|
text = m.group("quote_1") + "\n"
|
|
text = expand_leading_tab(text, 3)
|
|
text = _BLOCK_QUOTE_TRIM.sub("", text)
|
|
|
|
sc = self.compile_sc(["blank_line", "indent_code", "fenced_code"])
|
|
require_marker = bool(sc.match(text))
|
|
|
|
state.cursor = m.end() + 1
|
|
|
|
end_pos: Optional[int] = None
|
|
if require_marker:
|
|
m2 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
|
|
if m2:
|
|
quote = m2.group(0)
|
|
quote = _BLOCK_QUOTE_LEADING.sub("", quote)
|
|
quote = expand_leading_tab(quote, 3)
|
|
quote = _BLOCK_QUOTE_TRIM.sub("", quote)
|
|
text += quote
|
|
state.cursor = m2.end()
|
|
else:
|
|
prev_blank_line = False
|
|
break_sc = self.compile_sc(
|
|
[
|
|
"blank_line",
|
|
"thematic_break",
|
|
"fenced_code",
|
|
"list",
|
|
"block_html",
|
|
]
|
|
)
|
|
while state.cursor < state.cursor_max:
|
|
m3 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
|
|
if m3:
|
|
quote = m3.group(0)
|
|
quote = _BLOCK_QUOTE_LEADING.sub("", quote)
|
|
quote = expand_leading_tab(quote, 3)
|
|
quote = _BLOCK_QUOTE_TRIM.sub("", quote)
|
|
text += quote
|
|
state.cursor = m3.end()
|
|
if not quote.strip():
|
|
prev_blank_line = True
|
|
else:
|
|
prev_blank_line = bool(_LINE_BLANK_END.search(quote))
|
|
continue
|
|
|
|
if prev_blank_line:
|
|
# CommonMark Example 249
|
|
# because of laziness, a blank line is needed between
|
|
# a block quote and a following paragraph
|
|
break
|
|
|
|
m4 = break_sc.match(state.src, state.cursor)
|
|
if m4:
|
|
end_pos = self.parse_method(m4, state)
|
|
if end_pos:
|
|
break
|
|
|
|
# lazy continuation line
|
|
pos = state.find_line_end()
|
|
line = state.get_text(pos)
|
|
line = expand_leading_tab(line, 3)
|
|
text += line
|
|
state.cursor = pos
|
|
|
|
# according to CommonMark Example 6, the second tab should be
|
|
# treated as 4 spaces
|
|
return expand_tab(text), end_pos
|
|
|
|
def parse_block_quote(self, m: Match[str], state: BlockState) -> int:
|
|
"""Parse token for block quote. Here is an example of the syntax:
|
|
|
|
.. code-block:: markdown
|
|
|
|
> a block quote starts
|
|
> with right arrows
|
|
"""
|
|
text, end_pos = self.extract_block_quote(m, state)
|
|
# scan children state
|
|
child = state.child_state(text)
|
|
if state.depth() >= self.max_nested_level - 1:
|
|
rules = list(self.block_quote_rules)
|
|
rules.remove("block_quote")
|
|
else:
|
|
rules = self.block_quote_rules
|
|
|
|
self.parse(child, rules)
|
|
token = {"type": "block_quote", "children": child.tokens}
|
|
if end_pos:
|
|
state.prepend_token(token)
|
|
return end_pos
|
|
state.append_token(token)
|
|
return state.cursor
|
|
|
|
def parse_list(self, m: Match[str], state: BlockState) -> int:
|
|
"""Parse tokens for ordered and unordered list."""
|
|
return parse_list(self, m, state)
|
|
|
|
def parse_block_html(self, m: Match[str], state: BlockState) -> Optional[int]:
|
|
return self.parse_raw_html(m, state)
|
|
|
|
def parse_raw_html(self, m: Match[str], state: BlockState) -> Optional[int]:
|
|
marker = m.group(0).strip()
|
|
|
|
# rule 2
|
|
if marker == "<!--":
|
|
return _parse_html_to_end(state, "-->", m.end())
|
|
|
|
# rule 3
|
|
if marker == "<?":
|
|
return _parse_html_to_end(state, "?>", m.end())
|
|
|
|
# rule 5
|
|
if marker == "<![CDATA[":
|
|
return _parse_html_to_end(state, "]]>", m.end())
|
|
|
|
# rule 4
|
|
if marker.startswith("<!"):
|
|
return _parse_html_to_end(state, ">", m.end())
|
|
|
|
close_tag = None
|
|
open_tag = None
|
|
if marker.startswith("</"):
|
|
close_tag = marker[2:].lower()
|
|
# rule 6
|
|
if close_tag in BLOCK_TAGS:
|
|
return _parse_html_to_newline(state, self.BLANK_LINE)
|
|
else:
|
|
open_tag = marker[1:].lower()
|
|
# rule 1
|
|
if open_tag in PRE_TAGS:
|
|
end_tag = "</" + open_tag + ">"
|
|
return _parse_html_to_end(state, end_tag, m.end())
|
|
# rule 6
|
|
if open_tag in BLOCK_TAGS:
|
|
return _parse_html_to_newline(state, self.BLANK_LINE)
|
|
|
|
# Blocks of type 7 may not interrupt a paragraph.
|
|
end_pos = state.append_paragraph()
|
|
if end_pos:
|
|
return end_pos
|
|
|
|
# rule 7
|
|
start_pos = m.end()
|
|
end_pos = state.find_line_end()
|
|
if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or (
|
|
close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)
|
|
):
|
|
return _parse_html_to_newline(state, self.BLANK_LINE)
|
|
|
|
return None
|
|
|
|
def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None:
|
|
sc = self.compile_sc(rules)
|
|
|
|
while state.cursor < state.cursor_max:
|
|
m = sc.search(state.src, state.cursor)
|
|
if not m:
|
|
break
|
|
|
|
end_pos = m.start()
|
|
if end_pos > state.cursor:
|
|
text = state.get_text(end_pos)
|
|
state.add_paragraph(text)
|
|
state.cursor = end_pos
|
|
|
|
end_pos2 = self.parse_method(m, state)
|
|
if end_pos2:
|
|
state.cursor = end_pos2
|
|
else:
|
|
end_pos3 = state.find_line_end()
|
|
text = state.get_text(end_pos3)
|
|
state.add_paragraph(text)
|
|
state.cursor = end_pos3
|
|
|
|
if state.cursor < state.cursor_max:
|
|
text = state.src[state.cursor :]
|
|
state.add_paragraph(text)
|
|
state.cursor = state.cursor_max
|
|
|
|
|
|
def _parse_html_to_end(state: BlockState, end_marker: str, start_pos: int) -> int:
|
|
marker_pos = state.src.find(end_marker, start_pos)
|
|
if marker_pos == -1:
|
|
text = state.src[state.cursor :]
|
|
end_pos = state.cursor_max
|
|
else:
|
|
text = state.get_text(marker_pos)
|
|
state.cursor = marker_pos
|
|
end_pos = state.find_line_end()
|
|
text += state.get_text(end_pos)
|
|
|
|
state.append_token({"type": "block_html", "raw": text})
|
|
return end_pos
|
|
|
|
|
|
def _parse_html_to_newline(state: BlockState, newline: Pattern[str]) -> int:
|
|
m = newline.search(state.src, state.cursor)
|
|
if m:
|
|
end_pos = m.start()
|
|
text = state.get_text(end_pos)
|
|
else:
|
|
text = state.src[state.cursor :]
|
|
end_pos = state.cursor_max
|
|
|
|
state.append_token({"type": "block_html", "raw": text})
|
|
return end_pos
|