الملفات
Building-API-and-Monitoring…/venv/lib/python3.12/site-packages/mistune/block_parser.py
2025-09-27 23:34:33 +03:00

499 أسطر
16 KiB
Python

import re
from typing import Optional, List, Tuple, Match, Pattern
import string
from .util import (
unikey,
escape_url,
expand_tab,
expand_leading_tab,
)
from .core import Parser, BlockState
from .helpers import (
LINK_LABEL,
HTML_TAGNAME,
HTML_ATTRIBUTES,
BLOCK_TAGS,
PRE_TAGS,
unescape_char,
parse_link_href,
parse_link_title,
)
from .list_parser import parse_list, LIST_PATTERN
# Leading indentation (one to four spaces) at the start of every line of an
# indented code block.
_INDENT_CODE_TRIM = re.compile(r"^ {1,4}", flags=re.MULTILINE)

# Optional closing run of ``#`` at the end of an ATX heading's text.
_ATX_HEADING_TRIM = re.compile(r"(\s+|^)#+\s*$")

# The single optional space at the start of each block-quote line.
_BLOCK_QUOTE_TRIM = re.compile(r"^ ?", flags=re.MULTILINE)

# The ``>`` marker, with any spaces before it, at the start of each line.
_BLOCK_QUOTE_LEADING = re.compile(r"^ *>", flags=re.MULTILINE)

# Text whose final line is blank (only spaces/tabs between two newlines).
_LINE_BLANK_END = re.compile(r"\n[ \t]*\n$")

# Optional spaces/tabs followed by a newline, i.e. "nothing else on this line".
_BLANK_TO_LINE = re.compile(r"[ \t]*\n")

# Alternation group of every tag name listed in BLOCK_TAGS and PRE_TAGS.
_BLOCK_TAGS_PATTERN = f"({'|'.join(BLOCK_TAGS)}|{'|'.join(PRE_TAGS)})"

# Remainder of an opening tag (attributes, ``>``) up to the end of the line.
_OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r"[ \t]*>[ \t]*(?:\n|$)")

# Remainder of a closing tag (``>``) up to the end of the line.
_CLOSE_TAG_END = re.compile(r"[ \t]*>[ \t]*(?:\n|$)")

# One or more consecutive lines that each begin with a ``>`` marker indented
# by at most three spaces.
_STRICT_BLOCK_QUOTE = re.compile(r"( {0,3}>[^\n]*(?:\n|$))+")
class BlockParser(Parser[BlockState]):
    """Parser for block-level Markdown constructs.

    ``SPECIFICATION`` maps every rule name to its regular expression and
    ``DEFAULT_RULES`` lists the rules (in order) used by default for the
    content of block quotes and lists.  For each rule name there is a
    ``parse_<name>`` method; these are collected into ``self._methods`` when
    the parser is constructed.

    Every ``parse_<name>`` method receives the regex match and the current
    :class:`BlockState` and returns the position at which parsing should
    resume.  A falsy return value (``None``) tells :meth:`parse` that the rule
    did not apply, in which case the current line is treated as paragraph
    text.
    """

    state_cls = BlockState

    # One or more consecutive lines that contain only whitespace.
    BLANK_LINE = re.compile(r"(^[ \t\v\f]*\n)+", re.M)

    # Start conditions of any raw HTML block (any tag name is accepted).
    RAW_HTML = (
        r"^ {0,3}("
        r"</?" + HTML_TAGNAME + r"|"
        r"<!--|"  # comment
        r"<\?|"  # processing instruction
        r"<![A-Z]|"
        r"<!\[CDATA\[)"
    )

    # Start conditions of an HTML block restricted to the known block/pre
    # tag names; used as a rule that may end a lazy block quote.
    BLOCK_HTML = (
        r"^ {0,3}(?:"
        r"(?:</?" + _BLOCK_TAGS_PATTERN + r"(?:[ \t]+|\n|$))"
        r"|<!--"  # comment
        r"|<\?"  # processing instruction
        r"|<![A-Z]"
        r"|<!\[CDATA\[)"
    )

    SPECIFICATION = {
        "blank_line": r"(^[ \t\v\f]*\n)+",
        "atx_heading": r"^ {0,3}(?P<atx_1>#{1,6})(?!#+)(?P<atx_2>[ \t]*|[ \t]+.*?)$",
        "setex_heading": r"^ {0,3}(?P<setext_1>=|-){1,}[ \t]*$",
        "fenced_code": (
            r"^(?P<fenced_1> {0,3})(?P<fenced_2>`{3,}|~{3,})"
            r"[ \t]*(?P<fenced_3>.*?)$"
        ),
        "indent_code": (
            r"^(?: {4}| *\t)[^\n]+(?:\n+|$)"
            r"((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*"
        ),
        "thematic_break": r"^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$",
        "ref_link": r"^ {0,3}\[(?P<reflink_1>" + LINK_LABEL + r")\]:",
        "block_quote": r"^ {0,3}>(?P<quote_1>.*?)$",
        "list": LIST_PATTERN,
        "block_html": BLOCK_HTML,
        "raw_html": RAW_HTML,
    }

    DEFAULT_RULES = (
        "fenced_code",
        "indent_code",
        "atx_heading",
        "setex_heading",
        "thematic_break",
        "block_quote",
        "list",
        "ref_link",
        "raw_html",
        "blank_line",
    )

    def __init__(
        self,
        block_quote_rules: Optional[List[str]] = None,
        list_rules: Optional[List[str]] = None,
        max_nested_level: int = 6,
    ) -> None:
        """Create a block parser.

        :param block_quote_rules: rule names used to parse the content of a
            block quote; defaults to a copy of ``DEFAULT_RULES``.
        :param list_rules: rule names stored for list content; defaults to a
            copy of ``DEFAULT_RULES``.
        :param max_nested_level: nesting depth at which block quotes stop
            being recognised inside block quotes.
        """
        super().__init__()

        if block_quote_rules is None:
            block_quote_rules = list(self.DEFAULT_RULES)

        if list_rules is None:
            list_rules = list(self.DEFAULT_RULES)

        self.block_quote_rules = block_quote_rules
        self.list_rules = list_rules
        self.max_nested_level = max_nested_level

        # register default parse methods
        self._methods = {name: getattr(self, "parse_" + name) for name in self.SPECIFICATION}

    def parse_blank_line(self, m: Match[str], state: BlockState) -> int:
        """Parse token for blank lines."""
        state.append_token({"type": "blank_line"})
        return m.end()

    def parse_thematic_break(self, m: Match[str], state: BlockState) -> int:
        """Parse token for thematic break, e.g. ``<hr>`` tag in HTML."""
        state.append_token({"type": "thematic_break"})
        # $ does not count '\n'
        return m.end() + 1

    def parse_indent_code(self, m: Match[str], state: BlockState) -> int:
        """Parse token for code block which is indented by 4 spaces.

        When ``state.append_paragraph()`` returns a position the matched text
        is a part of the paragraph and that position is returned unchanged.
        Otherwise a ``block_code`` token with ``style`` ``"indent"`` is
        appended.
        """
        # it is a part of the paragraph
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        code = m.group(0)
        code = expand_leading_tab(code)
        code = _INDENT_CODE_TRIM.sub("", code)
        code = code.strip("\n")
        state.append_token({"type": "block_code", "raw": code, "style": "indent"})
        return m.end()

    def parse_fenced_code(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse token for fenced code block. A fenced code block is started with
        3 or more backtick(`) or tilde(~).

        An example of a fenced code block:

        .. code-block:: markdown

            ```python
            def markdown(text):
                return mistune.html(text)
            ```

        :returns: the position after the closing fence (or the end of the
            source when the fence is never closed), or ``None`` when the
            opening line is not a valid fence.
        """
        spaces = m.group("fenced_1")
        marker = m.group("fenced_2")
        info = m.group("fenced_3")

        c = marker[0]
        if info and c == "`":
            # CommonMark Example 145
            # Info strings for backtick code blocks cannot contain backticks
            if c in info:
                return None

        # The closing fence uses the same character and is at least as long
        # as the opening fence.
        _end = re.compile(r"^ {0,3}" + c + "{" + str(len(marker)) + r",}[ \t]*(?:\n|$)", re.M)
        # $ does not count '\n', so the code starts one character further on.
        cursor_start = m.end() + 1

        m2 = _end.search(state.src, cursor_start)
        if m2:
            code = state.src[cursor_start : m2.start()]
            end_pos = m2.end()
        else:
            # Unclosed fence: everything up to the end of the source is code.
            code = state.src[cursor_start:]
            end_pos = state.cursor_max

        if spaces and code:
            # Remove from each code line as much indentation as the opening
            # fence itself had.
            _trim_pattern = re.compile("^ {0," + str(len(spaces)) + "}", re.M)
            code = _trim_pattern.sub("", code)

        token = {"type": "block_code", "raw": code, "style": "fenced", "marker": marker}
        if info:
            info = unescape_char(info)
            token["attrs"] = {"info": info.strip()}

        state.append_token(token)
        return end_pos

    def parse_atx_heading(self, m: Match[str], state: BlockState) -> int:
        """Parse token for ATX heading. An ATX heading is started with 1 to 6
        symbol of ``#``."""
        level = len(m.group("atx_1"))
        text = m.group("atx_2").strip(string.whitespace)
        # remove last #
        if text:
            text = _ATX_HEADING_TRIM.sub("", text)

        token = {"type": "heading", "text": text, "attrs": {"level": level}, "style": "atx"}
        state.append_token(token)
        # $ does not count '\n'
        return m.end() + 1

    def parse_setex_heading(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse token for setex style heading. A setex heading syntax looks like:

        .. code-block:: markdown

            H1 title
            ========

        When the previous token is a paragraph it is converted in place into
        a heading (level 1 for ``=``, level 2 for ``-``).  Otherwise the line
        is retried as a thematic break or a list; ``None`` is returned when
        neither matches.
        """
        last_token = state.last_token()
        if last_token and last_token["type"] == "paragraph":
            level = 1 if m.group("setext_1") == "=" else 2
            last_token["type"] = "heading"
            last_token["style"] = "setext"
            last_token["attrs"] = {"level": level}
            return m.end() + 1

        sc = self.compile_sc(["thematic_break", "list"])
        m2 = sc.match(state.src, state.cursor)
        if m2:
            return self.parse_method(m2, state)
        return None

    def parse_ref_link(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse link references and save the link information into ``state.env``.

        Here is an example of a link reference:

        .. code-block:: markdown

            a [link][example]

            [example]: https://example.com "Optional title"

        This method will save the link reference into ``state.env`` as::

            state.env['ref_links']['example'] = {
                'url': 'https://example.com',
                'label': 'example',
                'title': "Optional title",
            }

        An already registered key is never overwritten.

        :returns: the position after the definition, or ``None`` when the
            text is not a valid link reference definition.
        """
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        label = m.group("reflink_1")
        key = unikey(label)
        if not key:
            return None

        href, href_pos = parse_link_href(state.src, m.end(), block=True)
        if href is None:
            return None

        assert href_pos is not None

        # A title may not extend past the next blank line.
        _blank = self.BLANK_LINE.search(state.src, href_pos)
        if _blank:
            max_pos = _blank.start()
        else:
            max_pos = state.cursor_max

        title, title_pos = parse_link_title(state.src, href_pos, max_pos)
        if title_pos:
            # The title only counts when nothing but blanks follow it on
            # its line; otherwise discard it and fall back to the bare href.
            m2 = _BLANK_TO_LINE.match(state.src, title_pos)
            if m2:
                title_pos = m2.end()
            else:
                title_pos = None
                title = None

        if title_pos is None:
            # Without a usable title the href itself must end its line.
            m3 = _BLANK_TO_LINE.match(state.src, href_pos)
            if m3:
                href_pos = m3.end()
            else:
                href_pos = None
                href = None

        end_pos = title_pos or href_pos
        if not end_pos:
            return None

        if key not in state.env["ref_links"]:
            assert href is not None
            href = unescape_char(href)
            data = {"url": escape_url(href), "label": label}
            if title:
                data["title"] = title
            state.env["ref_links"][key] = data
        return end_pos

    def extract_block_quote(self, m: Match[str], state: BlockState) -> Tuple[str, Optional[int]]:
        """Extract text and cursor end position of a block quote.

        ``state.cursor`` is advanced past every line consumed for the quote.

        :returns: ``(text, end_pos)`` where ``text`` is the quote content
            with the ``>`` markers removed and ``end_pos`` is the position
            returned by the rule that interrupted the quote, or ``None``
            when no rule interrupted it.
        """
        # cleanup at first to detect if it is code block
        text = m.group("quote_1") + "\n"
        text = expand_leading_tab(text, 3)
        text = _BLOCK_QUOTE_TRIM.sub("", text)

        # When the first line is blank or opens a code block, lazy
        # continuation is not allowed: every following line needs a marker.
        sc = self.compile_sc(["blank_line", "indent_code", "fenced_code"])
        require_marker = bool(sc.match(text))

        # $ does not count '\n'
        state.cursor = m.end() + 1

        end_pos: Optional[int] = None
        if require_marker:
            m2 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
            if m2:
                quote = m2.group(0)
                quote = _BLOCK_QUOTE_LEADING.sub("", quote)
                quote = expand_leading_tab(quote, 3)
                quote = _BLOCK_QUOTE_TRIM.sub("", quote)
                text += quote
                state.cursor = m2.end()
        else:
            prev_blank_line = False
            break_sc = self.compile_sc(
                [
                    "blank_line",
                    "thematic_break",
                    "fenced_code",
                    "list",
                    "block_html",
                ]
            )
            while state.cursor < state.cursor_max:
                m3 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor)
                if m3:
                    quote = m3.group(0)
                    quote = _BLOCK_QUOTE_LEADING.sub("", quote)
                    quote = expand_leading_tab(quote, 3)
                    quote = _BLOCK_QUOTE_TRIM.sub("", quote)
                    text += quote
                    state.cursor = m3.end()
                    if not quote.strip():
                        prev_blank_line = True
                    else:
                        prev_blank_line = bool(_LINE_BLANK_END.search(quote))
                    continue

                if prev_blank_line:
                    # CommonMark Example 249
                    # because of laziness, a blank line is needed between
                    # a block quote and a following paragraph
                    break

                m4 = break_sc.match(state.src, state.cursor)
                if m4:
                    end_pos = self.parse_method(m4, state)
                    if end_pos:
                        break

                # lazy continuation line
                pos = state.find_line_end()
                line = state.get_text(pos)
                line = expand_leading_tab(line, 3)
                text += line
                state.cursor = pos

        # according to CommonMark Example 6, the second tab should be
        # treated as 4 spaces
        return expand_tab(text), end_pos

    def parse_block_quote(self, m: Match[str], state: BlockState) -> int:
        """Parse token for block quote. Here is an example of the syntax:

        .. code-block:: markdown

            > a block quote starts
            > with right arrows

        The quote content is parsed in a child state with
        ``self.block_quote_rules``; once ``state.depth()`` reaches
        ``max_nested_level - 1`` the ``"block_quote"`` rule is left out so
        that quotes are not nested any deeper.
        """
        text, end_pos = self.extract_block_quote(m, state)
        # scan children state
        child = state.child_state(text)
        if state.depth() >= self.max_nested_level - 1:
            # Filter rather than ``list.remove`` so that a caller-supplied
            # rule list without "block_quote" does not raise ValueError.
            rules = [name for name in self.block_quote_rules if name != "block_quote"]
        else:
            rules = self.block_quote_rules

        self.parse(child, rules)
        token = {"type": "block_quote", "children": child.tokens}
        if end_pos:
            # A rule interrupted the quote and has already been handled by
            # ``extract_block_quote``.
            # NOTE(review): prepend_token presumably places the quote before
            # that rule's token - confirm against BlockState.
            state.prepend_token(token)
            return end_pos
        state.append_token(token)
        return state.cursor

    def parse_list(self, m: Match[str], state: BlockState) -> int:
        """Parse tokens for ordered and unordered list."""
        return parse_list(self, m, state)

    def parse_block_html(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse an HTML block matched by ``BLOCK_HTML``; same handling as
        :meth:`parse_raw_html`."""
        return self.parse_raw_html(m, state)

    def parse_raw_html(self, m: Match[str], state: BlockState) -> Optional[int]:
        """Parse a raw HTML block and append a ``block_html`` token.

        The "rule N" comments below number the different kinds of HTML
        block; each kind has its own end condition.

        :returns: the position after the HTML block, or ``None`` when the
            match does not start a valid HTML block.
        """
        marker = m.group(0).strip()

        # rule 2
        if marker == "<!--":
            return _parse_html_to_end(state, "-->", m.end())

        # rule 3
        if marker == "<?":
            return _parse_html_to_end(state, "?>", m.end())

        # rule 5
        if marker == "<![CDATA[":
            return _parse_html_to_end(state, "]]>", m.end())

        # rule 4
        if marker.startswith("<!"):
            return _parse_html_to_end(state, ">", m.end())

        close_tag = None
        open_tag = None
        if marker.startswith("</"):
            close_tag = marker[2:].lower()
            # rule 6
            if close_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)
        else:
            open_tag = marker[1:].lower()
            # rule 1
            if open_tag in PRE_TAGS:
                end_tag = "</" + open_tag + ">"
                return _parse_html_to_end(state, end_tag, m.end())
            # rule 6
            if open_tag in BLOCK_TAGS:
                return _parse_html_to_newline(state, self.BLANK_LINE)

        # Blocks of type 7 may not interrupt a paragraph.
        end_pos = state.append_paragraph()
        if end_pos:
            return end_pos

        # rule 7
        start_pos = m.end()
        end_pos = state.find_line_end()
        if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or (
            close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos)
        ):
            return _parse_html_to_newline(state, self.BLANK_LINE)

        return None

    def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None:
        """Tokenize ``state.src`` from ``state.cursor`` to ``state.cursor_max``.

        :param state: the state holding the source text and receiving the
            tokens.
        :param rules: rule names passed to ``compile_sc`` to build the
            scanner; ``None`` is forwarded unchanged.
        """
        sc = self.compile_sc(rules)

        while state.cursor < state.cursor_max:
            m = sc.search(state.src, state.cursor)
            if not m:
                break

            # Text in front of the match belongs to a paragraph.
            end_pos = m.start()
            if end_pos > state.cursor:
                text = state.get_text(end_pos)
                state.add_paragraph(text)
                state.cursor = end_pos

            end_pos2 = self.parse_method(m, state)
            if end_pos2:
                state.cursor = end_pos2
            else:
                # The rule rejected the match: treat the current line as
                # paragraph text and move on to the next line.
                end_pos3 = state.find_line_end()
                text = state.get_text(end_pos3)
                state.add_paragraph(text)
                state.cursor = end_pos3

        # Whatever is left after the last match is paragraph text as well.
        if state.cursor < state.cursor_max:
            text = state.src[state.cursor :]
            state.add_paragraph(text)
            state.cursor = state.cursor_max
def _parse_html_to_end(state: BlockState, end_marker: str, start_pos: int) -> int:
    """Append a ``block_html`` token that runs up to ``end_marker``.

    ``end_marker`` is searched in ``state.src`` starting at ``start_pos``.
    When it is found, the token covers the text from the cursor up to the
    marker plus the rest of the marker's line (as reported by
    ``state.find_line_end()``), and ``state.cursor`` is moved to the marker.
    When it is not found, the token covers everything from the cursor to the
    end of the source.

    :returns: the position at which block parsing should resume.
    """
    found_at = state.src.find(end_marker, start_pos)

    if found_at != -1:
        html = state.get_text(found_at)
        # Move the cursor onto the marker so that the line end is looked up
        # relative to the marker's line.
        state.cursor = found_at
        end_pos = state.find_line_end()
        html += state.get_text(end_pos)
    else:
        html = state.src[state.cursor :]
        end_pos = state.cursor_max

    state.append_token({"type": "block_html", "raw": html})
    return end_pos
def _parse_html_to_newline(state: BlockState, newline: Pattern[str]) -> int:
    """Append a ``block_html`` token that runs up to the next ``newline`` match.

    ``newline`` is searched in ``state.src`` from the current cursor.  The
    token covers the text in front of the match, or everything up to the end
    of the source when the pattern does not match.

    :returns: the start of the ``newline`` match, or ``state.cursor_max``
        when there is none.
    """
    blank = newline.search(state.src, state.cursor)

    if blank is None:
        html = state.src[state.cursor :]
        end_pos = state.cursor_max
    else:
        end_pos = blank.start()
        html = state.get_text(end_pos)

    state.append_token({"type": "block_html", "raw": html})
    return end_pos