import re
from typing import (
    Any,
    Dict,
    List,
    Match,
    MutableMapping,
    Optional,
)

from .core import InlineState, Parser
from .helpers import (
    HTML_ATTRIBUTES,
    HTML_TAGNAME,
    PREVENT_BACKSLASH,
    PUNCTUATION,
    parse_link,
    parse_link_label,
    parse_link_text,
    unescape_char,
)
from .util import escape_url, unikey

PAREN_END_RE = re.compile(r"\s*\)")

AUTO_EMAIL = (
    r"""<[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9]"""
    r"(?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?"
    r"(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>"
)

INLINE_HTML = (
    r"<" + HTML_TAGNAME + HTML_ATTRIBUTES + r"\s*/?>|"  # open tag
    r"</" + HTML_TAGNAME + r"\s*>|"  # close tag
    r"<!--(?!>|->)(?:(?!--)[\s\S])+?(?<!-)-->|"  # comment
    r"<\?[\s\S]+?\?>|"  # script like
    r"<![A-Z][\s\S]+?>|"  # doctype
    r"<!\[CDATA\[[\s\S]+?\]\]>"  # cdata
)

EMPHASIS_END_RE = {
    "*": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*(?!\*)"),
    "_": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])_(?!_)\b"),
    "**": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*\*(?!\*)"),
    "__": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])__(?!_)\b"),
    "***": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\\*|[^\s*])\*\*\*(?!\*)"),
    "___": re.compile(r"(?:" + PREVENT_BACKSLASH + r"\\_|[^\s_])___(?!_)\b"),
}


class InlineParser(Parser[InlineState]):
    sc_flag = 0
    state_cls = InlineState

    #: linebreak leaves two spaces at the end of line
    STD_LINEBREAK = r"(?:\\| {2,})\n\s*"

    #: every new line becomes <br>
    HARD_LINEBREAK = r" *\n\s*"
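
    # Example (illustrative): with the default STD_LINEBREAK pattern,
    # "foo  \nbar" (two trailing spaces) or "foo\\\nbar" yields a "linebreak"
    # token, while a bare "foo\nbar" only yields a "softbreak". When the
    # parser is constructed with hard_wrap=True, HARD_LINEBREAK is used
    # instead and every newline becomes a "linebreak".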

    # we only need to find the start pattern of an inline token
    SPECIFICATION = {
        # e.g. \`, \$
        "escape": r"(?:\\" + PUNCTUATION + ")+",
        # `code, ```code
        "codespan": r"`{1,}",
        # *w, **w, _w, __w
        "emphasis": r"\*{1,3}(?=[^\s*])|\b_{1,3}(?=[^\s_])",
        # [link], ![img]
        "link": r"!?\[",
        # <https://example.com>. regex copied from commonmark.js
        "auto_link": r"<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>",
        "auto_email": AUTO_EMAIL,
        "inline_html": INLINE_HTML,
        "linebreak": STD_LINEBREAK,
        "softbreak": HARD_LINEBREAK,
        "prec_auto_link": r"<[A-Za-z][A-Za-z\d.+-]{1,31}:",
        "prec_inline_html": r"</?" + HTML_TAGNAME + r"|<!--|<\?|<!|<\[CDATA\[",
    }

    #: rules for parsing inline tokens
    DEFAULT_RULES = (
        "escape",
        "codespan",
        "emphasis",
        "link",
        "auto_link",
        "auto_email",
        "inline_html",
        "linebreak",
    )

    def __init__(self, hard_wrap: bool = False) -> None:
        super(InlineParser, self).__init__()

        self.hard_wrap = hard_wrap
        # lazy add linebreak
        if hard_wrap:
            self.specification["linebreak"] = self.HARD_LINEBREAK
        else:
            self.rules.append("softbreak")

        self._methods = {name: getattr(self, "parse_" + name) for name in self.rules}

    def parse_escape(self, m: Match[str], state: InlineState) -> int:
        text = m.group(0)
        text = unescape_char(text)
        state.append_token(
            {
                "type": "text",
                "raw": text,
            }
        )
        return m.end()

    def parse_link(self, m: Match[str], state: InlineState) -> Optional[int]:
        pos = m.end()

        marker = m.group(0)
        is_image = marker[0] == "!"
        if is_image and state.in_image:
            state.append_token({"type": "text", "raw": marker})
            return pos
        elif not is_image and state.in_link:
            state.append_token({"type": "text", "raw": marker})
            return pos

        text = None
        label, end_pos = parse_link_label(state.src, pos)
        if label is None:
            text, end_pos = parse_link_text(state.src, pos)
            if text is None:
                return None

        assert end_pos is not None

        if text is None:
            text = label

        assert text is not None

        if end_pos >= len(state.src) and label is None:
            return None

        rules = ["codespan", "prec_auto_link", "prec_inline_html"]
        prec_pos = self.precedence_scan(m, state, end_pos, rules)
        if prec_pos:
            return prec_pos

        if end_pos < len(state.src):
            c = state.src[end_pos]
            if c == "(":
                # standard link [text](<url> "title")
                attrs, pos2 = parse_link(state.src, end_pos + 1)
                if pos2:
                    token = self.__parse_link_token(is_image, text, attrs, state)
                    state.append_token(token)
                    return pos2

            elif c == "[":
                # standard ref link [text][label]
                label2, pos2 = parse_link_label(state.src, end_pos + 1)
                if pos2:
                    end_pos = pos2
                    if label2:
                        label = label2

        if label is None:
            return None

        ref_links = state.env.get("ref_links")
        if not ref_links:
            return None

        key = unikey(label)
        env = ref_links.get(key)
        if env:
            attrs = {"url": env["url"], "title": env.get("title")}
            token = self.__parse_link_token(is_image, text, attrs, state)
            token["ref"] = key
            token["label"] = label
            state.append_token(token)
            return end_pos
        return None

    def __parse_link_token(
        self,
        is_image: bool,
        text: str,
        attrs: Optional[Dict[str, Any]],
        state: InlineState,
    ) -> Dict[str, Any]:
        new_state = state.copy()
        new_state.src = text
        if is_image:
            new_state.in_image = True
            token = {
                "type": "image",
                "children": self.render(new_state),
                "attrs": attrs,
            }
        else:
            new_state.in_link = True
            token = {
                "type": "link",
                "children": self.render(new_state),
                "attrs": attrs,
            }
        return token

    def parse_auto_link(self, m: Match[str], state: InlineState) -> int:
        text = m.group(0)
        pos = m.end()
        if state.in_link:
            self.process_text(text, state)
            return pos

        text = text[1:-1]
        self._add_auto_link(text, text, state)
        return pos

    def parse_auto_email(self, m: Match[str], state: InlineState) -> int:
        text = m.group(0)
        pos = m.end()
        if state.in_link:
            self.process_text(text, state)
            return pos

        text = text[1:-1]
        url = "mailto:" + text
        self._add_auto_link(url, text, state)
        return pos
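
    # Example (illustrative): "<https://example.com>" becomes a link token
    # whose single child is a text token holding the raw URL, while
    # "<user@example.com>" additionally gets a "mailto:" prefix on the url
    # attribute via _add_auto_link below.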

    def _add_auto_link(self, url: str, text: str, state: InlineState) -> None:
        state.append_token(
            {
                "type": "link",
                "children": [{"type": "text", "raw": text}],
                "attrs": {"url": escape_url(url)},
            }
        )

    def parse_emphasis(self, m: Match[str], state: InlineState) -> int:
        pos = m.end()

        marker = m.group(0)
        mlen = len(marker)
        if mlen == 1 and state.in_emphasis:
            state.append_token({"type": "text", "raw": marker})
            return pos
        elif mlen == 2 and state.in_strong:
            state.append_token({"type": "text", "raw": marker})
            return pos

        _end_re = EMPHASIS_END_RE[marker]
        m1 = _end_re.search(state.src, pos)
        if not m1:
            state.append_token({"type": "text", "raw": marker})
            return pos

        end_pos = m1.end()
        text = state.src[pos : end_pos - mlen]

        prec_pos = self.precedence_scan(m, state, end_pos)
        if prec_pos:
            return prec_pos

        new_state = state.copy()
        new_state.src = text
        if mlen == 1:
            new_state.in_emphasis = True
            children = self.render(new_state)
            state.append_token({"type": "emphasis", "children": children})
        elif mlen == 2:
            new_state.in_strong = True
            children = self.render(new_state)
            state.append_token({"type": "strong", "children": children})
        else:
            new_state.in_emphasis = True
            new_state.in_strong = True
            children = [{"type": "strong", "children": self.render(new_state)}]
            state.append_token(
                {
                    "type": "emphasis",
                    "children": children,
                }
            )
        return end_pos

    def parse_codespan(self, m: Match[str], state: InlineState) -> int:
        marker = m.group(0)
        # require same marker with same length at end
        pattern = re.compile(r"(.*?[^`])" + marker + r"(?!`)", re.S)

        pos = m.end()
        m2 = pattern.match(state.src, pos)
        if m2:
            end_pos = m2.end()
            code = m2.group(1)
            # Line endings are treated like spaces
            code = code.replace("\n", " ")
            if len(code.strip()):
                if code.startswith(" ") and code.endswith(" "):
                    code = code[1:-1]
            state.append_token({"type": "codespan", "raw": code})
            return end_pos
        else:
            state.append_token({"type": "text", "raw": marker})
            return pos

    def parse_linebreak(self, m: Match[str], state: InlineState) -> int:
        state.append_token({"type": "linebreak"})
        return m.end()

    def parse_softbreak(self, m: Match[str], state: InlineState) -> int:
        state.append_token({"type": "softbreak"})
        return m.end()

    def parse_inline_html(self, m: Match[str], state: InlineState) -> int:
        end_pos = m.end()
        html = m.group(0)
        state.append_token({"type": "inline_html", "raw": html})
        if html.startswith(("<a ", "<a>", "<A ", "<A>")):
            state.in_link = True
        elif html.startswith(("</a ", "</a>", "</A ", "</A>")):
            state.in_link = False
        return end_pos

    def process_text(self, text: str, state: InlineState) -> None:
        state.append_token({"type": "text", "raw": text})

    def parse(self, state: InlineState) -> List[Dict[str, Any]]:
        pos = 0
        sc = self.compile_sc()
        while pos < len(state.src):
            m = sc.search(state.src, pos)
            if not m:
                break

            end_pos = m.start()
            if end_pos > pos:
                hole = state.src[pos:end_pos]
                self.process_text(hole, state)

            new_pos = self.parse_method(m, state)
            if not new_pos:
                # move cursor 1 character forward
                pos = end_pos + 1
                hole = state.src[end_pos:pos]
                self.process_text(hole, state)
            else:
                pos = new_pos

        if pos == 0:
            # special case, just pure text
            self.process_text(state.src, state)
        elif pos < len(state.src):
            self.process_text(state.src[pos:], state)
        return state.tokens
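
    # Precedence sketch (illustrative): codespans, links, autolinks and raw
    # inline HTML bind more tightly than emphasis. In "*see `tick*`" the
    # codespan that starts at the backtick runs past the would-be closing "*",
    # so precedence_scan below abandons the emphasis, emits "*see " as plain
    # text, and lets parse_codespan consume the rest.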

    def precedence_scan(
        self,
        m: Match[str],
        state: InlineState,
        end_pos: int,
        rules: Optional[List[str]] = None,
    ) -> Optional[int]:
        if rules is None:
            rules = ["codespan", "link", "prec_auto_link", "prec_inline_html"]

        mark_pos = m.end()
        sc = self.compile_sc(rules)
        m1 = sc.search(state.src, mark_pos, end_pos)
        if not m1:
            return None

        lastgroup = m1.lastgroup
        if not lastgroup:
            return None
        rule_name = lastgroup.replace("prec_", "")
        sc = self.compile_sc([rule_name])
        m2 = sc.match(state.src, m1.start())
        if not m2:
            return None

        func = self._methods[rule_name]
        new_state = state.copy()
        new_state.src = state.src
        m2_pos = func(m2, new_state)
        if not m2_pos or m2_pos < end_pos:
            return None

        raw_text = state.src[m.start() : m2.start()]
        state.append_token({"type": "text", "raw": raw_text})
        for token in new_state.tokens:
            state.append_token(token)
        return m2_pos

    def render(self, state: InlineState) -> List[Dict[str, Any]]:
        self.parse(state)
        return state.tokens

    def __call__(self, s: str, env: MutableMapping[str, Any]) -> List[Dict[str, Any]]:
        state = self.state_cls(env)
        state.src = s
        return self.render(state)
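

# A minimal usage sketch (illustrative only; in normal operation the block
# parser creates the InlineState and supplies the env mapping):
#
#     inline = InlineParser()
#     tokens = inline("*hello* `world`", {})
#     # -> [{'type': 'emphasis', 'children': [{'type': 'text', 'raw': 'hello'}]},
#     #     {'type': 'text', 'raw': ' '},
#     #     {'type': 'codespan', 'raw': 'world'}]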