import re
import string
from typing import Any, Dict, Tuple, Union

from .util import escape_url

# An even-length (possibly empty) run of backslashes that is not itself
# preceded by a backslash. Prefixing a pattern with this ensures the character
# that follows is not backslash-escaped.
PREVENT_BACKSLASH = r"(?<!\\)(?:\\\\)*"
# Character class holding every ASCII punctuation character.
PUNCTUATION = r"[" + re.escape(string.punctuation) + r"]"

# Body of a link label: up to 500 items, each either a backslash escape or a
# character that is not a backslash or square bracket.
LINK_LABEL = r"(?:[^\\\[\]]|\\.){0,500}"

# Optional horizontal whitespace (with at most one newline) followed by "<".
LINK_BRACKET_START = re.compile(r"[ \t]*\n?[ \t]*<")
# A "<...>" destination; group 1 is the text between the angle brackets.
LINK_BRACKET_RE = re.compile(r"<([^<>\n\\\x00]*)>")
LINK_HREF_BLOCK_RE = re.compile(r"[ \t]*\n?[ \t]*([^\s]+)(?:\s|$)")
LINK_HREF_INLINE_RE = re.compile(
    r"[ \t]*\n?[ \t]*([^ \t\n]*?)(?:[ \t\n]|"
    r"(?:" + PREVENT_BACKSLASH + r"\)))"
)

LINK_TITLE_RE = re.compile(
    r"[ \t\n]+("
    r'"(?:\\' + PUNCTUATION + r'|[^"\x00])*"|'  # "title"
    r"'(?:\\" + PUNCTUATION + r"|[^'\x00])*'"  # 'title'
    r")"
)
PAREN_END_RE = re.compile(r"\s*\)")

HTML_TAGNAME = r"[A-Za-z][A-Za-z0-9-]*"
HTML_ATTRIBUTES = (
    r"(?:\s+[A-Za-z_:][A-Za-z0-9_.:-]*"
    r'(?:\s*=\s*(?:[^ !"\'=<>`]+|\'[^\']*?\'|"[^\"]*?"))?)*'
)

BLOCK_TAGS = (
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "meta",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
)
PRE_TAGS = ("pre", "script", "style", "textarea")

_INLINE_LINK_LABEL_RE = re.compile(LINK_LABEL + r"\]")
_INLINE_SQUARE_BRACKET_RE = re.compile(PREVENT_BACKSLASH + r"[\[\]]")
_ESCAPE_CHAR_RE = re.compile(r"\\(" + PUNCTUATION + r")")


def unescape_char(text: str) -> str:
    """Return *text* with each backslash-escaped punctuation character unescaped.

    A backslash followed by an ASCII punctuation character is replaced by that
    character alone; every other backslash is left untouched.
    """
    return _ESCAPE_CHAR_RE.sub(r"\1", text)


def parse_link_text(src: str, pos: int) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Scan bracketed link text, honouring nested and escaped brackets.

    *pos* is the index just after the opening ``[``. Unescaped ``[`` and ``]``
    are counted until the bracket that balances the opening one is found.

    Returns ``(text, end)`` where *text* is the content between the brackets
    and *end* is the index just past the closing ``]``, or ``(None, None)``
    when no balancing bracket exists.
    """
    level = 1
    found = False
    start_pos = pos

    while pos < len(src):
        m = _INLINE_SQUARE_BRACKET_RE.search(src, pos)
        if not m:
            break

        pos = m.end()
        # The match may begin with escaped-backslash pairs consumed by
        # PREVENT_BACKSLASH, so only its last character is the bracket itself.
        marker = m.group(0)[-1]
        if marker == "]":
            level -= 1
            if level == 0:
                found = True
                break
        else:
            level += 1

    if found:
        text = src[start_pos : pos - 1]
        return text, pos
    return None, None


def parse_link_label(src: str, start_pos: int) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Match a link label that starts at *start_pos* and ends with ``]``.

    Returns ``(label, end)`` where *label* excludes the closing ``]`` and *end*
    is the index just past it, or ``(None, None)`` when there is no match.
    """
    m = _INLINE_LINK_LABEL_RE.match(src, start_pos)
    if m:
        label = m.group(0)[:-1]
        return label, m.end()
    return None, None


def parse_link_href(
    src: str, start_pos: int, block: bool = False
) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Parse a link destination beginning at *start_pos*.

    Two forms are recognised:

    * ``<...>``: the text inside the angle brackets is returned together with
      the index just past ``>``. If ``<`` is seen but the bracketed form does
      not match, the result is ``(None, None)``.
    * a bare destination: matched with ``LINK_HREF_BLOCK_RE`` when *block* is
      true, otherwise with ``LINK_HREF_INLINE_RE``.

    For a bare destination the returned index points at the terminating
    character (whitespace or ``)``) so the caller can still inspect it. The
    exception is block mode when the destination runs to the end of *src*,
    where the index is the end of the match.

    Returns ``(href, end)`` or ``(None, None)`` when nothing matches.
    """
    m = LINK_BRACKET_START.match(src, start_pos)
    if m:
        # Step back onto the "<" so LINK_BRACKET_RE can match the whole pair.
        start_pos = m.end() - 1
        m = LINK_BRACKET_RE.match(src, start_pos)
        if m:
            return m.group(1), m.end()
        return None, None

    if block:
        m = LINK_HREF_BLOCK_RE.match(src, start_pos)
    else:
        m = LINK_HREF_INLINE_RE.match(src, start_pos)

    if not m:
        return None, None

    end_pos = m.end()
    href = m.group(1)
    # In block mode the pattern may stop at end-of-input instead of consuming a
    # whitespace terminator; then the last matched character belongs to href.
    if block and src[end_pos - 1] == href[-1]:
        return href, end_pos
    return href, end_pos - 1


def parse_link_title(
    src: str, start_pos: int, max_pos: int
) -> Union[Tuple[str, int], Tuple[None, None]]:
    """Parse a quoted link title located between *start_pos* and *max_pos*.

    The title must be preceded by whitespace and wrapped in single or double
    quotes. The surrounding quotes are removed and backslash escapes of
    punctuation are resolved.

    Returns ``(title, end)`` where *end* is the index just past the closing
    quote, or ``(None, None)`` when no title is present.
    """
    m = LINK_TITLE_RE.match(src, start_pos, max_pos)
    if m:
        title = m.group(1)[1:-1]
        title = unescape_char(title)
        return title, m.end()
    return None, None


def parse_link(src: str, pos: int) -> Union[Tuple[Dict[str, Any], int], Tuple[None, None]]:
    """Parse the ``(href "title")`` tail of an inline link starting at *pos*.

    The destination is required, the title is optional, and the construct must
    be closed by ``)`` (optionally preceded by whitespace).

    Returns ``(attrs, end)`` where *attrs* holds ``"url"`` (the unescaped
    destination passed through ``escape_url``) and, when a non-empty title was
    given, ``"title"``; *end* is the index just past the closing ``)``.
    Returns ``(None, None)`` when the destination or the closing parenthesis
    is missing.
    """
    href, href_pos = parse_link_href(src, pos)
    if href is None:
        return None, None

    assert href_pos is not None
    title, title_pos = parse_link_title(src, href_pos, len(src))
    next_pos = title_pos or href_pos
    m = PAREN_END_RE.match(src, next_pos)
    if not m:
        return None, None

    href = unescape_char(href)
    attrs = {"url": escape_url(href)}
    if title:
        attrs["title"] = title
    return attrs, m.end()