import re from typing import Optional, List, Tuple, Match, Pattern import string from .util import ( unikey, escape_url, expand_tab, expand_leading_tab, ) from .core import Parser, BlockState from .helpers import ( LINK_LABEL, HTML_TAGNAME, HTML_ATTRIBUTES, BLOCK_TAGS, PRE_TAGS, unescape_char, parse_link_href, parse_link_title, ) from .list_parser import parse_list, LIST_PATTERN _INDENT_CODE_TRIM = re.compile(r"^ {1,4}", flags=re.M) _ATX_HEADING_TRIM = re.compile(r"(\s+|^)#+\s*$") _BLOCK_QUOTE_TRIM = re.compile(r"^ ?", flags=re.M) _BLOCK_QUOTE_LEADING = re.compile(r"^ *>", flags=re.M) _LINE_BLANK_END = re.compile(r"\n[ \t]*\n$") _BLANK_TO_LINE = re.compile(r"[ \t]*\n") _BLOCK_TAGS_PATTERN = "(" + "|".join(BLOCK_TAGS) + "|" + "|".join(PRE_TAGS) + ")" _OPEN_TAG_END = re.compile(HTML_ATTRIBUTES + r"[ \t]*>[ \t]*(?:\n|$)") _CLOSE_TAG_END = re.compile(r"[ \t]*>[ \t]*(?:\n|$)") _STRICT_BLOCK_QUOTE = re.compile(r"( {0,3}>[^\n]*(?:\n|$))+") class BlockParser(Parser[BlockState]): state_cls = BlockState BLANK_LINE = re.compile(r"(^[ \t\v\f]*\n)+", re.M) RAW_HTML = ( r"^ {0,3}(" r"#{1,6})(?!#+)(?P[ \t]*|[ \t]+.*?)$", "setex_heading": r"^ {0,3}(?P=|-){1,}[ \t]*$", "fenced_code": ( r"^(?P {0,3})(?P`{3,}|~{3,})" r"[ \t]*(?P.*?)$" ), "indent_code": ( r"^(?: {4}| *\t)[^\n]+(?:\n+|$)" r"((?:(?: {4}| *\t)[^\n]+(?:\n+|$))|\s)*" ), "thematic_break": r"^ {0,3}((?:-[ \t]*){3,}|(?:_[ \t]*){3,}|(?:\*[ \t]*){3,})$", "ref_link": r"^ {0,3}\[(?P" + LINK_LABEL + r")\]:", "block_quote": r"^ {0,3}>(?P.*?)$", "list": LIST_PATTERN, "block_html": BLOCK_HTML, "raw_html": RAW_HTML, } DEFAULT_RULES = ( "fenced_code", "indent_code", "atx_heading", "setex_heading", "thematic_break", "block_quote", "list", "ref_link", "raw_html", "blank_line", ) def __init__( self, block_quote_rules: Optional[List[str]] = None, list_rules: Optional[List[str]] = None, max_nested_level: int = 6, ): super(BlockParser, self).__init__() if block_quote_rules is None: block_quote_rules = list(self.DEFAULT_RULES) if list_rules is None: list_rules = list(self.DEFAULT_RULES) self.block_quote_rules = block_quote_rules self.list_rules = list_rules self.max_nested_level = max_nested_level # register default parse methods self._methods = {name: getattr(self, "parse_" + name) for name in self.SPECIFICATION} def parse_blank_line(self, m: Match[str], state: BlockState) -> int: """Parse token for blank lines.""" state.append_token({"type": "blank_line"}) return m.end() def parse_thematic_break(self, m: Match[str], state: BlockState) -> int: """Parse token for thematic break, e.g. ``
`` tag in HTML.""" state.append_token({"type": "thematic_break"}) # $ does not count '\n' return m.end() + 1 def parse_indent_code(self, m: Match[str], state: BlockState) -> int: """Parse token for code block which is indented by 4 spaces.""" # it is a part of the paragraph end_pos = state.append_paragraph() if end_pos: return end_pos code = m.group(0) code = expand_leading_tab(code) code = _INDENT_CODE_TRIM.sub("", code) code = code.strip("\n") state.append_token({"type": "block_code", "raw": code, "style": "indent"}) return m.end() def parse_fenced_code(self, m: Match[str], state: BlockState) -> Optional[int]: """Parse token for fenced code block. A fenced code block is started with 3 or more backtick(`) or tilde(~). An example of a fenced code block: .. code-block:: markdown ```python def markdown(text): return mistune.html(text) ``` """ spaces = m.group("fenced_1") marker = m.group("fenced_2") info = m.group("fenced_3") c = marker[0] if info and c == "`": # CommonMark Example 145 # Info strings for backtick code blocks cannot contain backticks if info.find(c) != -1: return None _end = re.compile(r"^ {0,3}" + c + "{" + str(len(marker)) + r",}[ \t]*(?:\n|$)", re.M) cursor_start = m.end() + 1 m2 = _end.search(state.src, cursor_start) if m2: code = state.src[cursor_start : m2.start()] end_pos = m2.end() else: code = state.src[cursor_start:] end_pos = state.cursor_max if spaces and code: _trim_pattern = re.compile("^ {0," + str(len(spaces)) + "}", re.M) code = _trim_pattern.sub("", code) token = {"type": "block_code", "raw": code, "style": "fenced", "marker": marker} if info: info = unescape_char(info) token["attrs"] = {"info": info.strip()} state.append_token(token) return end_pos def parse_atx_heading(self, m: Match[str], state: BlockState) -> int: """Parse token for ATX heading. An ATX heading is started with 1 to 6 symbol of ``#``.""" level = len(m.group("atx_1")) text = m.group("atx_2").strip(string.whitespace) # remove last # if text: text = _ATX_HEADING_TRIM.sub("", text) token = {"type": "heading", "text": text, "attrs": {"level": level}, "style": "atx"} state.append_token(token) return m.end() + 1 def parse_setex_heading(self, m: Match[str], state: BlockState) -> Optional[int]: """Parse token for setex style heading. A setex heading syntax looks like: .. code-block:: markdown H1 title ======== """ last_token = state.last_token() if last_token and last_token["type"] == "paragraph": level = 1 if m.group("setext_1") == "=" else 2 last_token["type"] = "heading" last_token["style"] = "setext" last_token["attrs"] = {"level": level} return m.end() + 1 sc = self.compile_sc(["thematic_break", "list"]) m2 = sc.match(state.src, state.cursor) if m2: return self.parse_method(m2, state) return None def parse_ref_link(self, m: Match[str], state: BlockState) -> Optional[int]: """Parse link references and save the link information into ``state.env``. Here is an example of a link reference: .. code-block:: markdown a [link][example] [example]: https://example.com "Optional title" This method will save the link reference into ``state.env`` as:: state.env['ref_links']['example'] = { 'url': 'https://example.com', 'title': "Optional title", } """ end_pos = state.append_paragraph() if end_pos: return end_pos label = m.group("reflink_1") key = unikey(label) if not key: return None href, href_pos = parse_link_href(state.src, m.end(), block=True) if href is None: return None assert href_pos is not None _blank = self.BLANK_LINE.search(state.src, href_pos) if _blank: max_pos = _blank.start() else: max_pos = state.cursor_max title, title_pos = parse_link_title(state.src, href_pos, max_pos) if title_pos: m2 = _BLANK_TO_LINE.match(state.src, title_pos) if m2: title_pos = m2.end() else: title_pos = None title = None if title_pos is None: m3 = _BLANK_TO_LINE.match(state.src, href_pos) if m3: href_pos = m3.end() else: href_pos = None href = None end_pos = title_pos or href_pos if not end_pos: return None if key not in state.env["ref_links"]: assert href is not None href = unescape_char(href) data = {"url": escape_url(href), "label": label} if title: data["title"] = title state.env["ref_links"][key] = data return end_pos def extract_block_quote(self, m: Match[str], state: BlockState) -> Tuple[str, Optional[int]]: """Extract text and cursor end position of a block quote.""" # cleanup at first to detect if it is code block text = m.group("quote_1") + "\n" text = expand_leading_tab(text, 3) text = _BLOCK_QUOTE_TRIM.sub("", text) sc = self.compile_sc(["blank_line", "indent_code", "fenced_code"]) require_marker = bool(sc.match(text)) state.cursor = m.end() + 1 end_pos: Optional[int] = None if require_marker: m2 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor) if m2: quote = m2.group(0) quote = _BLOCK_QUOTE_LEADING.sub("", quote) quote = expand_leading_tab(quote, 3) quote = _BLOCK_QUOTE_TRIM.sub("", quote) text += quote state.cursor = m2.end() else: prev_blank_line = False break_sc = self.compile_sc( [ "blank_line", "thematic_break", "fenced_code", "list", "block_html", ] ) while state.cursor < state.cursor_max: m3 = _STRICT_BLOCK_QUOTE.match(state.src, state.cursor) if m3: quote = m3.group(0) quote = _BLOCK_QUOTE_LEADING.sub("", quote) quote = expand_leading_tab(quote, 3) quote = _BLOCK_QUOTE_TRIM.sub("", quote) text += quote state.cursor = m3.end() if not quote.strip(): prev_blank_line = True else: prev_blank_line = bool(_LINE_BLANK_END.search(quote)) continue if prev_blank_line: # CommonMark Example 249 # because of laziness, a blank line is needed between # a block quote and a following paragraph break m4 = break_sc.match(state.src, state.cursor) if m4: end_pos = self.parse_method(m4, state) if end_pos: break # lazy continuation line pos = state.find_line_end() line = state.get_text(pos) line = expand_leading_tab(line, 3) text += line state.cursor = pos # according to CommonMark Example 6, the second tab should be # treated as 4 spaces return expand_tab(text), end_pos def parse_block_quote(self, m: Match[str], state: BlockState) -> int: """Parse token for block quote. Here is an example of the syntax: .. code-block:: markdown > a block quote starts > with right arrows """ text, end_pos = self.extract_block_quote(m, state) # scan children state child = state.child_state(text) if state.depth() >= self.max_nested_level - 1: rules = list(self.block_quote_rules) rules.remove("block_quote") else: rules = self.block_quote_rules self.parse(child, rules) token = {"type": "block_quote", "children": child.tokens} if end_pos: state.prepend_token(token) return end_pos state.append_token(token) return state.cursor def parse_list(self, m: Match[str], state: BlockState) -> int: """Parse tokens for ordered and unordered list.""" return parse_list(self, m, state) def parse_block_html(self, m: Match[str], state: BlockState) -> Optional[int]: return self.parse_raw_html(m, state) def parse_raw_html(self, m: Match[str], state: BlockState) -> Optional[int]: marker = m.group(0).strip() # rule 2 if marker == "", m.end()) # rule 3 if marker == "", m.end()) # rule 5 if marker == "", m.end()) # rule 4 if marker.startswith("", m.end()) close_tag = None open_tag = None if marker.startswith("" return _parse_html_to_end(state, end_tag, m.end()) # rule 6 if open_tag in BLOCK_TAGS: return _parse_html_to_newline(state, self.BLANK_LINE) # Blocks of type 7 may not interrupt a paragraph. end_pos = state.append_paragraph() if end_pos: return end_pos # rule 7 start_pos = m.end() end_pos = state.find_line_end() if (open_tag and _OPEN_TAG_END.match(state.src, start_pos, end_pos)) or ( close_tag and _CLOSE_TAG_END.match(state.src, start_pos, end_pos) ): return _parse_html_to_newline(state, self.BLANK_LINE) return None def parse(self, state: BlockState, rules: Optional[List[str]] = None) -> None: sc = self.compile_sc(rules) while state.cursor < state.cursor_max: m = sc.search(state.src, state.cursor) if not m: break end_pos = m.start() if end_pos > state.cursor: text = state.get_text(end_pos) state.add_paragraph(text) state.cursor = end_pos end_pos2 = self.parse_method(m, state) if end_pos2: state.cursor = end_pos2 else: end_pos3 = state.find_line_end() text = state.get_text(end_pos3) state.add_paragraph(text) state.cursor = end_pos3 if state.cursor < state.cursor_max: text = state.src[state.cursor :] state.add_paragraph(text) state.cursor = state.cursor_max def _parse_html_to_end(state: BlockState, end_marker: str, start_pos: int) -> int: marker_pos = state.src.find(end_marker, start_pos) if marker_pos == -1: text = state.src[state.cursor :] end_pos = state.cursor_max else: text = state.get_text(marker_pos) state.cursor = marker_pos end_pos = state.find_line_end() text += state.get_text(end_pos) state.append_token({"type": "block_html", "raw": text}) return end_pos def _parse_html_to_newline(state: BlockState, newline: Pattern[str]) -> int: m = newline.search(state.src, state.cursor) if m: end_pos = m.start() text = state.get_text(end_pos) else: text = state.src[state.cursor :] end_pos = state.cursor_max state.append_token({"type": "block_html", "raw": text}) return end_pos